This series implements an event-driven cpufreq governor that scales cpu frequency as a function of cfs runqueue utilization. The intent of this RFC is to get some discussion going about how the scheduler can become the policy engine for selecting cpu frequency, what limitations exist, and what design we want to take to get to a solution.
This series depends on having frequency-invariant representations for load. This requires Vincent's recently merged cpu capacity rework patches, as well as a new patch from Morten included here. Morten's patch will likely make an appearance in his energy aware scheduling v4 series.
Thanks to Juri Lelli <juri.lelli@arm.com> for contributing to the development of the governor.
A git branch with these patches can be pulled from here: https://git.linaro.org/people/mike.turquette/linux.git sched-freq
Smoke testing has been done on an OMAP4 Pandaboard and an Exynos 5800 Chromebook2. Extensive benchmarking and regression testing has not yet been done. Before sinking too much time into extensive testing I'd like to get feedback on the general design.
Michael Turquette (3):
  sched: sched feature for cpu frequency selection
  sched: export get_cpu_usage & capacity_orig_of
  sched: cpufreq_sched_cfs: PELT-based cpu frequency scaling

Morten Rasmussen (1):
  arm: Frequency invariant scheduler load-tracking support

 arch/arm/include/asm/topology.h  |   7 +
 arch/arm/kernel/smp.c            |  53 ++++++-
 arch/arm/kernel/topology.c       |  17 +++
 drivers/cpufreq/Kconfig          |  24 +++
 include/linux/cpufreq.h          |   3 +
 kernel/sched/Makefile            |   1 +
 kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c              |  20 ++-
 kernel/sched/features.h          |   6 +
 kernel/sched/sched.h             |   9 ++
 10 files changed, 450 insertions(+), 4 deletions(-)
 create mode 100644 kernel/sched/cpufreq_sched_cfs.c
--
1.9.1
From: Morten Rasmussen <Morten.Rasmussen@arm.com>
Implements an arch-specific function to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is:
(current_freq(cpu) << SCHED_CAPACITY_SHIFT) / max_freq(cpu)
This implementation only provides frequency invariance. No micro-architecture invariance yet.
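For example, with illustrative numbers: a cpu currently running at 600 MHz whose maximum frequency is 1200 MHz gets a factor of

	(600000 << 10) / 1200000 = 512

i.e. half of SCHED_CAPACITY_SCALE (1024), so load-tracking contributions accrued at that frequency are weighted at half the rate of contributions accrued at full speed.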
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
---
changes since internal v1:
* replaced two commits from eas v3 with this new one from Morten

 arch/arm/include/asm/topology.h |  7 ++++++
 arch/arm/kernel/smp.c           | 53 +++++++++++++++++++++++++++++++++++++++--
 arch/arm/kernel/topology.c      | 17 +++++++++++++
 3 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85ff..4b985dc 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -24,6 +24,13 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);

+#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
+struct sched_domain;
+extern
+unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
+
+DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity);
+
 #else

 static inline void init_cpu_topology(void) { }
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 86ef244..297ce1b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -672,12 +672,34 @@ static DEFINE_PER_CPU(unsigned long, l_p_j_ref);
 static DEFINE_PER_CPU(unsigned long, l_p_j_ref_freq);
 static unsigned long global_l_p_j_ref;
 static unsigned long global_l_p_j_ref_freq;
+static DEFINE_PER_CPU(atomic_long_t, cpu_max_freq);
+DEFINE_PER_CPU(atomic_long_t, cpu_freq_capacity);
+
+/*
+ * Scheduler load-tracking scale-invariance
+ *
+ * Provides the scheduler with a scale-invariance correction factor that
+ * compensates for frequency scaling through arch_scale_freq_capacity()
+ * (implemented in topology.c).
+ */
+static inline
+void scale_freq_capacity(int cpu, unsigned long curr, unsigned long max)
+{
+	unsigned long capacity;
+
+	if (!max)
+		return;
+
+	capacity = (curr << SCHED_CAPACITY_SHIFT) / max;
+	atomic_long_set(&per_cpu(cpu_freq_capacity, cpu), capacity);
+}

 static int cpufreq_callback(struct notifier_block *nb,
 					unsigned long val, void *data)
 {
 	struct cpufreq_freqs *freq = data;
 	int cpu = freq->cpu;
+	unsigned long max = atomic_long_read(&per_cpu(cpu_max_freq, cpu));

 	if (freq->flags & CPUFREQ_CONST_LOOPS)
 		return NOTIFY_OK;
@@ -702,6 +724,9 @@ static int cpufreq_callback(struct notifier_block *nb,
 			per_cpu(l_p_j_ref_freq, cpu), freq->new);
 	}
+
+	scale_freq_capacity(cpu, freq->new, max);
+
 	return NOTIFY_OK;
 }

@@ -709,11 +734,35 @@ static struct notifier_block cpufreq_notifier = {
 	.notifier_call = cpufreq_callback,
 };

+static int cpufreq_policy_callback(struct notifier_block *nb,
+				   unsigned long val, void *data)
+{
+	struct cpufreq_policy *policy = data;
+	int i;
+
+	for_each_cpu(i, policy->cpus) {
+		scale_freq_capacity(i, policy->cur, policy->max);
+		atomic_long_set(&per_cpu(cpu_max_freq, i), policy->max);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_policy_notifier = {
+	.notifier_call = cpufreq_policy_callback,
+};
+
 static int __init register_cpufreq_notifier(void)
 {
-	return cpufreq_register_notifier(&cpufreq_notifier,
+	int ret;
+
+	ret = cpufreq_register_notifier(&cpufreq_notifier,
 					CPUFREQ_TRANSITION_NOTIFIER);
+	if (ret)
+		return ret;
+
+	return cpufreq_register_notifier(&cpufreq_policy_notifier,
+					CPUFREQ_POLICY_NOTIFIER);
 }
 core_initcall(register_cpufreq_notifier);
-
 #endif
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 08b7847..9c09e6e 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -169,6 +169,23 @@ static void update_cpu_capacity(unsigned int cpu)
 		cpu, arch_scale_cpu_capacity(NULL, cpu));
 }

+/*
+ * Scheduler load-tracking scale-invariance
+ *
+ * Provides the scheduler with a scale-invariance correction factor that
+ * compensates for frequency scaling (arch_scale_freq_capacity()). The scaling
+ * factor is updated in smp.c
+ */
+unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+	unsigned long curr = atomic_long_read(&per_cpu(cpu_freq_capacity, cpu));
+
+	if (!curr)
+		return SCHED_CAPACITY_SCALE;
+
+	return curr;
+}
+
 #else
 static inline void parse_dt_topology(void) {}
 static inline void update_cpu_capacity(unsigned int cpuid) {}
--
1.9.1
On 04/27/2015 09:46 AM, Michael Turquette wrote:
> From: Morten Rasmussen <Morten.Rasmussen@arm.com>
>
> Implements an arch-specific function to provide the scheduler with a
> frequency scaling correction factor for more accurate load-tracking.
>
> [...]
>
> +#define arch_scale_freq_capacity arm_arch_scale_freq_capacity

What is this macro for?

> +struct sched_domain;
> +extern
> +unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu);
> +
> +DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity);

IMO cpu_freq_capacity should be statically declared in the core code and
modified/inspected through accessors also in the core code, e.g.:

	sched_cpu_freq_capacity_set(int cpu, unsigned long freq_capacity);
	unsigned long sched_cpu_freq_capacity_get(int cpu);
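A minimal sketch of what such core-code accessors might look like (the per-cpu variable name and its placement are hypothetical; only the accessor names come from the suggestion above):

	/* hypothetical storage in, e.g., kernel/sched/core.c */
	static DEFINE_PER_CPU(atomic_long_t, sched_freq_capacity);

	void sched_cpu_freq_capacity_set(int cpu, unsigned long freq_capacity)
	{
		atomic_long_set(&per_cpu(sched_freq_capacity, cpu), freq_capacity);
	}

	unsigned long sched_cpu_freq_capacity_get(int cpu)
	{
		return atomic_long_read(&per_cpu(sched_freq_capacity, cpu));
	}

The arch notifier code would then call the setter, and arch_scale_freq_capacity() the getter, keeping the per-cpu variable out of the arch headers.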
> +static DEFINE_PER_CPU(atomic_long_t, cpu_max_freq);

In the code, cpu_max_freq is only used to update the frequency-invariance
factor. Wouldn't it be simpler to use cpufreq_quick_get_max(int cpu)
directly instead of declaring another per cpu variable?
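One possible reading of that suggestion, as a sketch (this moves the max lookup inside the helper; cpufreq_quick_get_max() returns 0 when no policy is available, which the existing !max check already covers):

	static inline void scale_freq_capacity(int cpu, unsigned long curr)
	{
		unsigned long max = cpufreq_quick_get_max(cpu);

		if (!max)
			return;

		atomic_long_set(&per_cpu(cpu_freq_capacity, cpu),
				(curr << SCHED_CAPACITY_SHIFT) / max);
	}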
> +	scale_freq_capacity(cpu, freq->new, max);

scale_freq_capacity(cpu, cpufreq_quick_get_max(cpu)) ?

> +		scale_freq_capacity(i, policy->cur, policy->max);

scale_freq_capacity(cpu, cpufreq_quick_get_max(cpu)) ?

> +		atomic_long_set(&per_cpu(cpu_max_freq, i), policy->max);

The atomic_long_set() would then no longer be needed.

> +	if (!curr)
> +		return SCHED_CAPACITY_SCALE;

Why not initialize 'cpu_freq_capacity' with the right value, so that !curr
can't happen?
On 29 April 2015 at 12:34, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> On 04/27/2015 09:46 AM, Michael Turquette wrote:
>> +#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
>
> What is this macro for?

This is used so that no useless computation is added to the hot path when
arch_scale_freq_capacity is not used. This was asked for by Peter.

> IMO cpu_freq_capacity should be statically declared in the core code and
> modified/inspected through accessors also in the core code.

Peter asked that arm_arch_scale_freq_capacity not add any additional
instruction when it is not used by an arch, because it's on the very hot
path of the scheduler.

> +unsigned long arm_arch_scale_freq_capacity(struct sched_domain *sd, int cpu)

I wonder if you should rather move arm_arch_scale_freq_capacity into
arch/arm/kernel/smp.c, as all the other functions and variables are there.
This would allow you to remove the
DECLARE_PER_CPU(atomic_long_t, cpu_freq_capacity) from the topology.h file.
On 04/29/2015 01:15 PM, Vincent Guittot wrote:
>>> +#define arch_scale_freq_capacity arm_arch_scale_freq_capacity
>>
>> What is this macro for?
>
> This is used so that no useless computation is added to the hot path when
> arch_scale_freq_capacity is not used. This was asked for by Peter.

What is the difference with having a dummy empty function with a 'weak'
attribute (which is how it is done currently in the kernel)?
On 29 April 2015 at 14:22, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
>> This is used so that no useless computation is added to the hot path when
>> arch_scale_freq_capacity is not used. This was asked for by Peter.
>
> What is the difference with having a dummy empty function with a 'weak'
> attribute (which is how it is done currently in the kernel)?

You can have a look at the thread for the full discussion:
https://lkml.org/lkml/2015/3/24/113
On 04/29/2015 02:34 PM, Vincent Guittot wrote:
>> What is the difference with having a dummy empty function with a 'weak'
>> attribute (which is how it is done currently in the kernel)?
>
> You can have a look at the thread for the full discussion:
> https://lkml.org/lkml/2015/3/24/113

Thanks for the pointer. The link seems to be down for the moment, but I was
able to dig through the different folders and find the thread in my mailbox
(that would have been easier if I had been cc'ed).

It is still not clear to me why the macro is better than 'weak'. It sounds
like using the 'weak' attribute is the best thing to do, no?
On 29 April 2015 at 15:10, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> It is still not clear to me why the macro is better than 'weak'. It sounds
> like using the 'weak' attribute is the best thing to do, no?

The weak function adds the useless sequence:

	value *= 1024;
	value >>= 10;

whereas the macro doesn't add any additional instruction.
On 04/29/2015 03:17 PM, Vincent Guittot wrote:
> The weak function adds the useless sequence:
>
> 	value *= 1024;
> 	value >>= 10;
>
> whereas the macro doesn't add any additional instruction.

So if the macro is not defined for the architecture, the compilation will
fail. I don't see why the function below is not right:

	unsigned long __weak
	arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;
	}
On 29 April 2015 at 15:27, Daniel Lezcano <daniel.lezcano@linaro.org> wrote:
> So if the macro is not defined for the architecture, the compilation will
> fail.

No, there is a fallback in sched.h:

	#ifndef arch_scale_freq_capacity
	static __always_inline
	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;
	}
	#endif

> I don't see why the function below is not right:
>
> 	unsigned long __weak
> 	arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
> 	{
> 		return SCHED_CAPACITY_SCALE;
> 	}

AFAIU, it doesn't ensure that the function will be inlined and optimized by
the compiler. I did several tests during the thread discussion, and only
the chosen solution was able to remove the useless sequence:

	value *= 1024;
	value >>= 10;
On 04/29/2015 03:33 PM, Vincent Guittot wrote:
> No, there is a fallback in sched.h:
>
> 	#ifndef arch_scale_freq_capacity
> 	static __always_inline
> 	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
> 	{
> 		return SCHED_CAPACITY_SCALE;
> 	}
> 	#endif

Ah, ok. I didn't see this one.
On Wed, Apr 29, 2015 at 03:12:38PM +0100, Daniel Lezcano wrote:
>> I don't see why the function below is not right:
>>
>> 	unsigned long __weak
>> 	arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
>> 	{
>> 		return SCHED_CAPACITY_SCALE;
>> 	}
AFAIU, the __weak attribute guarantees that the function cannot be inlined at all. __weak symbols are resolved at link time, not at compile time, hence they cannot be inlined. So Daniel's example above, which is pretty much what we used to have prior to 4.1, will always lead to a function call. On top of that, the compiler has no chance of figuring out that the function always returns a constant, which is paired with a shift or division at every call site in fair.c and could all have been optimized out.

The __weak trick was fine while we didn't call arch_scale_freq_capacity() very often. Now that we call it all the time, we have to minimize the overhead, and the #define trick Peter came up with seems to be the only feasible way to get zero overhead when the architecture doesn't care about frequency scaling of capacity.
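To make the difference concrete, a minimal sketch (the call site below is illustrative, with made-up variable names, not the exact fair.c code):

	/* sched.h fallback, picked only when the arch does not
	 * #define arch_scale_freq_capacity to its own implementation */
	#ifndef arch_scale_freq_capacity
	static __always_inline
	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return SCHED_CAPACITY_SCALE;	/* 1 << SCHED_CAPACITY_SHIFT */
	}
	#endif

	/* typical hot-path use: with the inline above the compiler folds
	 * (delta * 1024) >> 10 back to plain 'delta'; with a __weak function
	 * the call, the multiply and the shift all survive */
	contrib = (delta * arch_scale_freq_capacity(NULL, cpu))
					>> SCHED_CAPACITY_SHIFT;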
Morten
This patch introduces the SCHED_ENERGY_FREQ sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set to false when SCHED_DEBUG is not defined and thus disabled by default.
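With SCHED_DEBUG set, the feature can then be flipped at run time through the standard sched_features debugfs interface, for example:

	echo SCHED_ENERGY_FREQ > /sys/kernel/debug/sched_features
	echo NO_SCHED_ENERGY_FREQ > /sys/kernel/debug/sched_features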
Signed-off-by: Michael Turquette <mturquette@linaro.org>
---
changes since internal v1: none

 kernel/sched/fair.c     | 5 +++++
 kernel/sched/features.h | 6 ++++++
 2 files changed, 11 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46855d0..75aec8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4207,6 +4207,11 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif

+static inline bool sched_energy_freq(void)
+{
+	return sched_feat(SCHED_ENERGY_FREQ);
+}
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd..77381cf 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+/*
+ * Scheduler-driven CPU frequency selection aimed to save energy based on
+ * load tracking
+ */
+SCHED_FEAT(SCHED_ENERGY_FREQ, false)
--
1.9.1
get_cpu_usage and capacity_orig_of are useful for a cpu frequency scaling policy which is based on cfs load tracking and cpu capacity metrics. Expose these calls in sched.h so that they can be used in such a policy.
Signed-off-by: Michael Turquette <mturquette@linaro.org>
---
changes since internal v1:
* exported capacity_orig_of

 kernel/sched/fair.c  | 4 ++--
 kernel/sched/sched.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 75aec8d..393fc36 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4366,7 +4366,7 @@ static unsigned long capacity_of(int cpu)
 	return cpu_rq(cpu)->cpu_capacity;
 }

-static unsigned long capacity_orig_of(int cpu)
+unsigned long capacity_orig_of(int cpu)
 {
 	return cpu_rq(cpu)->cpu_capacity_orig;
 }
@@ -4801,7 +4801,7 @@ done:
  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
  */
-static int get_cpu_usage(int cpu)
+int get_cpu_usage(int cpu)
 {
 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..63a8be9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,6 +1396,9 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif

+int get_cpu_usage(int cpu);
+unsigned long capacity_orig_of(int cpu);
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
--
1.9.1
On 04/27/2015 09:46 AM, Michael Turquette wrote:
> get_cpu_usage and capacity_orig_of are useful for a cpu frequency scaling
> policy which is based on cfs load tracking and cpu capacity metrics.
> Expose these calls in sched.h so that they can be used in such a policy.
>
> [...]
>
> +int get_cpu_usage(int cpu);
> +unsigned long capacity_orig_of(int cpu);

This one could be changed to a static inline in the header file, no?
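For example, a sketch of the static inline variant (the body is lifted straight from fair.c; this assumes cpu_rq() is visible at that point in sched.h):

	static inline unsigned long capacity_orig_of(int cpu)
	{
		return cpu_rq(cpu)->cpu_capacity_orig;
	}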
Quoting Daniel Lezcano (2015-04-29 05:23:18)
On 04/27/2015 09:46 AM, Michael Turquette wrote:
get_cpu_usage and capacity_orig_of are useful for a cpu frequency scaling policy which is based on cfs load tracking and cpu capacity metrics. Expose these calls in sched.h so that they can be used in such a policy.
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
exported capacity_orig_of
kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 75aec8d..393fc36 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4366,7 +4366,7 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; }
-static unsigned long capacity_orig_of(int cpu) +unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; } @@ -4801,7 +4801,7 @@ done:
- Without capping the usage, a group could be seen as overloaded (CPU0 usage
- at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
*/ -static int get_cpu_usage(int cpu) +int get_cpu_usage(int cpu) { unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; unsigned long capacity = capacity_orig_of(cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e1299..63a8be9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1396,6 +1396,9 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif
+int get_cpu_usage(int cpu); +unsigned long capacity_orig_of(int cpu);
This one could be changed to a static inline in the header file, no?
Yes it could. I'll update the patch.
Regards, Mike
- static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
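For reference, Daniel's suggestion would look something like this in sched.h (a sketch only, reusing the one-line body from the diff; get_cpu_usage() could move over the same way, and this is kernel-internal code, not standalone):

/* kernel/sched/sched.h */
static inline unsigned long capacity_orig_of(int cpu)
{
	return cpu_rq(cpu)->cpu_capacity_orig;
}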
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requesting a frequency which is equivalent to the current capacity utilization, plus a margin.
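As a concrete illustration of that mapping, here is a standalone sketch of the selection arithmetic (the 1024 scale mirrors the scheduler's capacity units and the 125 mirrors the margin used below; the frequency and utilization numbers are made up):

#include <stdio.h>

#define CAPACITY_SCALE 1024	/* scheduler capacity units */
#define MARGIN_PCT 125		/* utilization + 25% headroom */

int main(void)
{
	unsigned long max_freq = 2000000;	/* kHz, hypothetical policy->max */
	unsigned long usage = 410;		/* ~40% of capacity */
	unsigned long target = usage * MARGIN_PCT / 100;	/* 512 */
	unsigned long freq = target * max_freq / CAPACITY_SCALE;

	printf("requested %lu kHz of %lu kHz max\n", freq, max_freq); /* 1000000 */
	return 0;
}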
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
1) re-using the cpufreq machine drivers without using the governor interface is hard.
2) using the cpufreq interface allows us to switch between the scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time (e.g. by writing the governor name to the policy's scaling_governor attribute in sysfs). This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow for peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org --- changes since internal v1: * renamed everything * fixed possible deadlock between gov_cfs_thread and gov_cfs_stop * replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges * usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases * dropped workqueue method due to instability * kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
drivers/cpufreq/Kconfig | 24 +++ include/linux/cpufreq.h | 3 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++ kernel/sched/sched.h | 6 + 6 files changed, 359 insertions(+) create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a171fef..35ba9c3 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_SCHED_CFS + bool "sched_cfs" + select CPU_FREQ_GOV_SCHED_CFS + select CPU_FREQ_GOV_PERFORMANCE + help + Use the CPUfreq governor 'sched_cfs' as default. This scales + cpu frequency from the scheduler as per-entity load tracking + statistics are updated. endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS + tristate "'sched cfs' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched_cfs' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is invoked from the completely fair + scheduler when updating per-entity load tracking statistics. + Latency to respond to changes in load is improved over polling + governors due to its event-driven design. + + If in doubt, say N. + comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS) +extern struct cpufreq_governor cpufreq_gov_cfs; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cfs) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/* + * Copyright (C) 2015 Michael Turquette mturquette@linaro.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h> +#include <linux/irq_work.h> + +#include "sched.h" + +#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * + * struct gov_data is the per-policy gov_cfs-specific data structure. A + * per-policy instance of it is created when the gov_cfs governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). + */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + struct cpufreq_policy *policy; +}; + +/** + * gov_cfs_select_freq - pick the next frequency for a cpu + * @policy: the cpufreq policy whose frequency may be changed + * + * gov_cfs_select_freq selects a frequency based on pelt load statistics + * tracked by cfs. First it finds the most utilized cpu in the policy and then + * maps that utilization value onto a cpu frequency and returns it. + * + * Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value + * before converting it to a frequency. The margin is derived from MARGIN_PCT, + * which itself is inspired by imbalance_pct in cfs. This is needed to + * proactively increase frequency in the case of increasing load. + * + * This approach attempts to maintain headroom of 25% unutilized cpu capacity. + * A traditional way of doing this is to take 75% of the current capacity and + * check if current utilization crosses that threshold. The only problem with + * that approach is determining the next cpu frequency target if that threshold + * is crossed. + * + * Instead of using the 75% threshold, gov_cfs_select_freq adds a 25% + * utilization margin to the utilization and converts that to a frequency. This + * removes conditional logic around checking thresholds and better supports + * drivers that use non-discretized frequency ranges (i.e. no pre-defined + * frequency tables or operating points). + * + * Returns frequency selected. 
+ */ +static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy) +{ + int cpu = 0; + struct gov_data *gd; + unsigned long freq = 0, max_usage = 0, usage = 0; + + if (!policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* + * get_cpu_usage is called without locking the runqueues. This is the + * same behavior used by find_busiest_cpu in load_balance. We are + * willing to accept occasionally stale data here in exchange for + * lockless behavior. + */ + for_each_cpu(cpu, policy->cpus) { + usage = get_cpu_usage(cpu); + if (usage > max_usage) + max_usage = usage; + } + + /* add margin to max_usage based on imbalance_pct */ + max_usage = max_usage * MARGIN_PCT / 100; + + cpu = cpumask_first(policy->cpus); + + /* freq is current utilization + 25% */ + freq = max_usage * policy->max / capacity_orig_of(cpu); + +out: + return freq; +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int gov_cfs_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long freq; + int ret; + + policy = (struct cpufreq_policy *) data; + if (!policy) { + pr_warn("%s: missing policy\n", __func__); + do_exit(-EINVAL); + } + + gd = policy->governor_data; + if (!gd) { + pr_warn("%s: missing governor data\n", __func__); + do_exit(-EINVAL); + } + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus); + if (ret) { + pr_warn("%s: failed to set allowed ptr\n", __func__); + do_exit(-EINVAL); + } + + /* main loop of the per-policy kthread */ + do { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + if (kthread_should_stop()) + break; + + /* avoid race with gov_cfs_stop */ + if (!down_write_trylock(&policy->rwsem)) + continue; + + freq = gov_cfs_select_freq(policy); + + ret = __cpufreq_driver_target(policy, freq, + CPUFREQ_RELATION_H); + if (ret) + pr_debug("%s: __cpufreq_driver_target returned %d\n", + __func__, ret); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); + } while (!kthread_should_stop()); + + do_exit(0); +} + +static void gov_cfs_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) { + return; + } + + wake_up_process(gd->task); +} + +/** + * gov_cfs_update_cpu - interface to scheduler for changing capacity values + * @cpu: cpu whose capacity utilization has recently changed + * + * gov_cfs_udpate_cpu is an interface exposed to the scheduler so that the + * scheduler may inform the governor of updates to capacity utilization and + * make changes to cpu frequency. Currently this interface is designed around + * PELT values in CFS. It can be expanded to other scheduling classes in the + * future if needed. + * + * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up + * the thread that does the actual work, gov_cfs_thread. 
+ */ +void gov_cfs_update_cpu(int cpu) +{ + struct cpufreq_policy *policy; + struct gov_data *gd; + + /* XXX put policy pointer in per-cpu data? */ + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) { + return; + } + + if (!policy->governor_data) { + goto out; + } + + gd = policy->governor_data; + + /* bail early if we are throttled */ + if (ktime_before(ktime_get(), gd->throttle)) { + goto out; + } + + irq_work_queue_on(&gd->irq_work, cpu); + +out: + cpufreq_cpu_put(policy); + return; +} + +static void gov_cfs_start(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + /* prepare per-policy private data */ + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) { + pr_debug("%s: failed to allocate private data\n", __func__); + return; + } + + /* + * Don't ask for freq changes at an higher rate than what + * the driver advertises as transition latency. + */ + gd->throttle_nsec = policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + /* init per-policy kthread */ + gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task"); + if (IS_ERR_OR_NULL(gd->task)) + pr_err("%s: failed to create kgov_cfs_task thread\n", __func__); + + init_irq_work(&gd->irq_work, gov_cfs_irq_work); + policy->governor_data = gd; + gd->policy = policy; +} + +static void gov_cfs_stop(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + + gd = policy->governor_data; + kthread_stop(gd->task); + + policy->governor_data = NULL; + + /* FIXME replace with devm counterparts? */ + kfree(gd); +} + +static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_START: + /* Start managing the frequency */ + gov_cfs_start(policy); + return 0; + + case CPUFREQ_GOV_STOP: + gov_cfs_stop(policy); + return 0; + + case CPUFREQ_GOV_LIMITS: /* unused */ + case CPUFREQ_GOV_POLICY_INIT: /* unused */ + case CPUFREQ_GOV_POLICY_EXIT: /* unused */ + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS +static +#endif +struct cpufreq_governor cpufreq_gov_cfs = { + .name = "gov_cfs", + .governor = gov_cfs_setup, + .owner = THIS_MODULE, +}; + +static int __init gov_cfs_init(void) +{ + return cpufreq_register_governor(&cpufreq_gov_cfs); +} + +static void __exit gov_cfs_exit(void) +{ + cpufreq_unregister_governor(&cpufreq_gov_cfs); +} + +/* Try to make this the default governor */ +fs_initcall(gov_cfs_init); + +MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); } + + if(sched_energy_freq()) + gov_cfs_update_cpu(cpu_of(rq)); + hrtick_update(rq); }
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } + + if(sched_energy_freq()) + gov_cfs_update_cpu(cpu_of(rq)); + hrtick_update(rq); }
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1); + + if(sched_energy_freq()) + gov_cfs_update_cpu(cpu_of(rq)); }
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 63a8be9..ec23523 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) int get_cpu_usage(int cpu); unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS +void gov_cfs_update_cpu(int cpu); +#else +static inline void gov_cfs_update_cpu(int cpu) {} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- 1.9.1
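To summarize the moving parts for reviewers, here is a rough userspace approximation of the event path (plain function calls stand in for the scheduler hook, the irq_work and the kthread; nothing here is the actual kernel API, and in the real code the throttle window is re-armed by the kthread, not the caller):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define THROTTLE_NSEC 50000000ULL	/* 50ms default, as in the patch */

static uint64_t throttle_expiry;	/* stands in for gd->throttle */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* stands in for gov_cfs_update_cpu(), called from the fair.c hooks */
static void update_cpu(int cpu)
{
	if (now_ns() < throttle_expiry)
		return;			/* bail early if we are throttled */

	/* real code: irq_work_queue_on() -> gov_cfs_irq_work() ->
	 * wake_up_process(gd->task); the kthread then computes a
	 * frequency and calls __cpufreq_driver_target() */
	printf("cpu%d: evaluate frequency\n", cpu);

	throttle_expiry = now_ns() + THROTTLE_NSEC;
}

int main(void)
{
	update_cpu(0);	/* first event: frequency evaluation */
	update_cpu(0);	/* inside the 50ms window: throttled, no-op */
	return 0;
}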
Hi Mike,
On 27/04/15 08:46, Michael Turquette wrote:
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requestiong a frequency which is
requesting ^
equivalent to the current capacity utilization, plus a margin.
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
- re-using the cpufreq machine drivers without using the governor
interface is hard.
- using the cpufreq interface allows us to switch between the
scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time. This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow for peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
- renamed everything
- fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
- replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges
- usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
- dropped workqueue method due to instability
- kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
drivers/cpufreq/Kconfig | 24 +++ include/linux/cpufreq.h | 3 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++ kernel/sched/sched.h | 6 + 6 files changed, 359 insertions(+) create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a171fef..35ba9c3 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor.
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
bool "sched_cfs"
select CPU_FREQ_GOV_SCHED_CFS
select CPU_FREQ_GOV_PERFORMANCE
help
Use the CPUfreq governor 'sched_cfs' as default. This scales
cpu frequency from the scheduler as per-entity load tracking
statistics are updated.
endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS
tristate "'sched cfs' cpufreq governor"
depends on CPU_FREQ
Also CONFIG_IRQ_WORK is a dependency.
select CPU_FREQ_GOV_COMMON
help
'sched_cfs' - this governor scales cpu frequency from the
scheduler as a function of cpu capacity utilization. It does
not evaluate utilization on a periodic basis (as ondemand
does) but instead is invoked from the completely fair
scheduler when updating per-entity load tracking statistics.
Latency to respond to changes in load is improved over polling
governors due to its event-driven design.
If in doubt, say N.
comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV) +extern struct cpufreq_governor cpufreq_gov_cap_gov; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cap_gov) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/*
- Copyright (C) 2015 Michael Turquette mturquette@linaro.org
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h>
We don't need this anymore (at least for now), right?
+#include <linux/irq_work.h>
+#include "sched.h"
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */
+/**
- gov_data - per-policy data internal to the governor
- @throttle: next throttling period expiry. Derived from throttle_nsec
- @throttle_nsec: throttle period length in nanoseconds
- @task: worker thread for dvfs transition that may block/sleep
- @irq_work: callback used to wake up worker thread
- struct gov_data is the per-policy gov_cfs-specific data structure. A
- per-policy instance of it is created when the gov_cfs governor receives
- the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
- member of struct cpufreq_policy.
- Readers of this data must call down_read(policy->rwsem). Writers must
- call down_write(policy->rwsem).
- */
+struct gov_data {
ktime_t throttle;
unsigned int throttle_nsec;
struct task_struct *task;
struct irq_work irq_work;
struct cpufreq_policy *policy;
+};
+/**
- gov_cfs_select_freq - pick the next frequency for a cpu
- @policy: the cpufreq policy whose frequency may be changed
- gov_cfs_select_freq selects a frequency based on pelt load statistics
- tracked by cfs. First it finds the most utilized cpu in the policy and then
- maps that utilization value onto a cpu frequency and returns it.
- Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
- before converting it to a frequency. The margin is derived from MARGIN_PCT,
- which itself is inspired by imbalance_pct in cfs. This is needed to
- proactively increase frequency in the case of increasing load.
utilization? ^
- This approach attempts to maintain headroom of 25% unutilized cpu capacity.
- A traditional way of doing this is to take 75% of the current capacity and
- check if current utilization crosses that threshold. The only problem with
- that approach is determining the next cpu frequency target if that threshold
- is crossed.
- Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
- utilization margin to the utilization and converts that to a frequency. This
- removes conditional logic around checking thresholds and better supports
- drivers that use non-discretized frequency ranges (i.e. no pre-defined
- frequency tables or operating points).
- Returns frequency selected.
- */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy) +{
int cpu = 0;
struct gov_data *gd;
unsigned long freq = 0, max_usage = 0, usage = 0;
if (!policy->governor_data)
goto out;
gd = policy->governor_data;
/*
* get_cpu_usage is called without locking the runqueues. This is the
* same behavior used by find_busiest_cpu in load_balance. We are
* willing to accept occasionally stale data here in exchange for
* lockless behavior.
*/
for_each_cpu(cpu, policy->cpus) {
usage = get_cpu_usage(cpu);
if (usage > max_usage)
max_usage = usage;
}
/* add margin to max_usage based on imbalance_pct */
max_usage = max_usage * MARGIN_PCT / 100;
cpu = cpumask_first(policy->cpus);
/* freq is current utilization + 25% */
freq = max_usage * policy->max / capacity_orig_of(cpu);
+out:
return freq;
+}
+/*
- we pass in struct cpufreq_policy. This is safe because changing out the
- policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
- which tears down all of the data structures and __cpufreq_governor(policy,
- CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
- new policy pointer
- */
+static int gov_cfs_thread(void *data) +{
struct sched_param param;
struct cpufreq_policy *policy;
struct gov_data *gd;
unsigned long freq;
int ret;
policy = (struct cpufreq_policy *) data;
if (!policy) {
pr_warn("%s: missing policy\n", __func__);
do_exit(-EINVAL);
}
gd = policy->governor_data;
if (!gd) {
pr_warn("%s: missing governor data\n", __func__);
do_exit(-EINVAL);
}
param.sched_priority = 50;
ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m);
if (ret) {
pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
do_exit(-EINVAL);
} else {
pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
__func__, gd->task->pid);
}
ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
if (ret) {
pr_warn("%s: failed to set allowed ptr\n", __func__);
do_exit(-EINVAL);
}
/* main loop of the per-policy kthread */
do {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
if (kthread_should_stop())
break;
/* avoid race with gov_cfs_stop */
if (!down_write_trylock(&policy->rwsem))
continue;
freq = gov_cfs_select_freq(policy);
ret = __cpufreq_driver_target(policy, freq,
CPUFREQ_RELATION_H);
I think we should use CPUFREQ_RELATION_L here. From the comments I read:
#define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */
So we have to tell the driver to select a frequency with enough capacity (above the current one).
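The difference is easy to see with a toy frequency table. This standalone sketch (the table values are hypothetical) picks a frequency both ways for the same target:

#include <stdio.h>

/* hypothetical OPP table in kHz, ascending */
static const unsigned long table[] = { 400000, 800000, 1200000, 1600000 };
#define NFREQ (sizeof(table) / sizeof(table[0]))

/* lowest frequency at or above target (CPUFREQ_RELATION_L) */
static unsigned long pick_l(unsigned long target)
{
	for (unsigned int i = 0; i < NFREQ; i++)
		if (table[i] >= target)
			return table[i];
	return table[NFREQ - 1];
}

/* highest frequency at or below target (CPUFREQ_RELATION_H),
 * falling back to the lowest entry */
static unsigned long pick_h(unsigned long target)
{
	unsigned long best = table[0];

	for (unsigned int i = 0; i < NFREQ; i++)
		if (table[i] <= target)
			best = table[i];
	return best;
}

int main(void)
{
	unsigned long target = 900000;	/* kHz, computed usage + margin */

	/* RELATION_L: 1200000 (enough capacity); RELATION_H: 800000 (too slow) */
	printf("L=%lu H=%lu\n", pick_l(target), pick_h(target));
	return 0;
}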
if (ret)
pr_debug("%s: __cpufreq_driver_target returned %d\n",
__func__, ret);
gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
up_write(&policy->rwsem);
} while (!kthread_should_stop());
do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work) +{
struct gov_data *gd;
gd = container_of(irq_work, struct gov_data, irq_work);
if (!gd) {
return;
}
No brackets?
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaint and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
+}
+/**
- gov_cfs_update_cpu - interface to scheduler for changing capacity values
- @cpu: cpu whose capacity utilization has recently changed
- gov_cfs_update_cpu is an interface exposed to the scheduler so that the
- scheduler may inform the governor of updates to capacity utilization and
- make changes to cpu frequency. Currently this interface is designed around
- PELT values in CFS. It can be expanded to other scheduling classes in the
- future if needed.
- gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
- the thread that does the actual work, gov_cfs_thread.
- */
+void gov_cfs_update_cpu(int cpu) +{
struct cpufreq_policy *policy;
struct gov_data *gd;
/* XXX put policy pointer in per-cpu data? */
policy = cpufreq_cpu_get(cpu);
if (IS_ERR_OR_NULL(policy)) {
return;
}
if (!policy->governor_data) {
goto out;
}
gd = policy->governor_data;
/* bail early if we are throttled */
if (ktime_before(ktime_get(), gd->throttle)) {
goto out;
}
No brackets in the 3 ifs above?
Thanks,
- Juri
irq_work_queue_on(&gd->irq_work, cpu);
+out:
cpufreq_cpu_put(policy);
return;
+}
+static void gov_cfs_start(struct cpufreq_policy *policy) +{
struct gov_data *gd;
/* prepare per-policy private data */
gd = kzalloc(sizeof(*gd), GFP_KERNEL);
if (!gd) {
pr_debug("%s: failed to allocate private data\n", __func__);
return;
}
/*
* Don't ask for freq changes at a higher rate than what
* the driver advertises as transition latency.
*/
gd->throttle_nsec = policy->cpuinfo.transition_latency ?
policy->cpuinfo.transition_latency :
THROTTLE_NSEC;
pr_debug("%s: throttle threshold = %u [ns]\n",
__func__, gd->throttle_nsec);
/* init per-policy kthread */
gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
if (IS_ERR_OR_NULL(gd->task))
pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
init_irq_work(&gd->irq_work, gov_cfs_irq_work);
policy->governor_data = gd;
gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy) +{
struct gov_data *gd;
gd = policy->governor_data;
kthread_stop(gd->task);
policy->governor_data = NULL;
/* FIXME replace with devm counterparts? */
kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event) +{
switch (event) {
case CPUFREQ_GOV_START:
/* Start managing the frequency */
gov_cfs_start(policy);
return 0;
case CPUFREQ_GOV_STOP:
gov_cfs_stop(policy);
return 0;
case CPUFREQ_GOV_LIMITS: /* unused */
case CPUFREQ_GOV_POLICY_INIT: /* unused */
case CPUFREQ_GOV_POLICY_EXIT: /* unused */
break;
}
return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS +static +#endif +struct cpufreq_governor cpufreq_gov_cfs = {
.name = "gov_cfs",
.governor = gov_cfs_setup,
.owner = THIS_MODULE,
+};
+static int __init gov_cfs_init(void) +{
return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+static void __exit gov_cfs_exit(void) +{
cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+/* Try to make this the default governor */ +fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 63a8be9..ec23523 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) int get_cpu_usage(int cpu); unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS +void gov_cfs_update_cpu(int cpu); +#else +static inline void gov_cfs_update_cpu(int cpu) {} +#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- 1.9.1
Quoting Juri Lelli (2015-04-27 10:09:50)
Hi Mike,
On 27/04/15 08:46, Michael Turquette wrote:
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requestiong a frequency which is
requesting ^
equivalent to the current capacity utilization, plus a margin.
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
- re-using the cpufreq machine drivers without using the governor
interface is hard.
- using the cpufreq interface allows us to switch between the
scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time. This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow for peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836 [1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
- renamed everything
- fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
- replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges
- usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
- dropped workqueue method due to instability
- kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
drivers/cpufreq/Kconfig | 24 +++ include/linux/cpufreq.h | 3 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 11 ++ kernel/sched/sched.h | 6 + 6 files changed, 359 insertions(+) create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a171fef..35ba9c3 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor.
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
bool "sched_cfs"
select CPU_FREQ_GOV_SCHED_CFS
select CPU_FREQ_GOV_PERFORMANCE
help
Use the CPUfreq governor 'sched_cfs' as default. This scales
cpu frequency from the scheduler as per-entity load tracking
statistics are updated.
endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS
tristate "'sched cfs' cpufreq governor"
depends on CPU_FREQ
Also CONFIG_IRQ_WORK is a dependency.
select CPU_FREQ_GOV_COMMON
help
'sched_cfs' - this governor scales cpu frequency from the
scheduler as a function of cpu capacity utilization. It does
not evaluate utilization on a periodic basis (as ondemand
does) but instead is invoked from the completely fair
scheduler when updating per-entity load tracking statistics.
Latency to respond to changes in load is improved over polling
governors due to its event-driven design.
If in doubt, say N.
comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV) +extern struct cpufreq_governor cpufreq_gov_cap_gov; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cap_gov) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/*
- Copyright (C) 2015 Michael Turquette mturquette@linaro.org
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h>
We don't need this anymore (at least for now), right?
+#include <linux/irq_work.h>
+#include "sched.h"
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */
+/**
- gov_data - per-policy data internal to the governor
- @throttle: next throttling period expiry. Derived from throttle_nsec
- @throttle_nsec: throttle period length in nanoseconds
- @task: worker thread for dvfs transition that may block/sleep
- @irq_work: callback used to wake up worker thread
- struct gov_data is the per-policy gov_cfs-specific data structure. A
- per-policy instance of it is created when the gov_cfs governor receives
- the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
- member of struct cpufreq_policy.
- Readers of this data must call down_read(policy->rwsem). Writers must
- call down_write(policy->rwsem).
- */
+struct gov_data {
ktime_t throttle;
unsigned int throttle_nsec;
struct task_struct *task;
struct irq_work irq_work;
struct cpufreq_policy *policy;
+};
+/**
- gov_cfs_select_freq - pick the next frequency for a cpu
- @policy: the cpufreq policy whose frequency may be changed
- gov_cfs_select_freq selects a frequency based on pelt load statistics
- tracked by cfs. First it finds the most utilized cpu in the policy and then
- maps that utilization value onto a cpu frequency and returns it.
- Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
- before converting it to a frequency. The margin is derived from MARGIN_PCT,
- which itself is inspired by imbalance_pct in cfs. This is needed to
- proactively increase frequency in the case of increasing load.
utilization? ^
- This approach attempts to maintain headroom of 25% unutilized cpu capacity.
- A traditional way of doing this is to take 75% of the current capacity and
- check if current utilization crosses that threshold. The only problem with
- that approach is determining the next cpu frequency target if that threshold
- is crossed.
- Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
- utilization margin to the utilization and converts that to a frequency. This
- removes conditional logic around checking thresholds and better supports
- drivers that use non-discretized frequency ranges (i.e. no pre-defined
- frequency tables or operating points).
- Returns frequency selected.
- */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy) +{
int cpu = 0;
struct gov_data *gd;
unsigned long freq = 0, max_usage = 0, usage = 0;
if (!policy->governor_data)
goto out;
gd = policy->governor_data;
/*
* get_cpu_usage is called without locking the runqueues. This is the
* same behavior used by find_busiest_cpu in load_balance. We are
* willing to accept occasionally stale data here in exchange for
* lockless behavior.
*/
for_each_cpu(cpu, policy->cpus) {
usage = get_cpu_usage(cpu);
if (usage > max_usage)
max_usage = usage;
}
/* add margin to max_usage based on imbalance_pct */
max_usage = max_usage * MARGIN_PCT / 100;
cpu = cpumask_first(policy->cpus);
/* freq is current utilization + 25% */
freq = max_usage * policy->max / capacity_orig_of(cpu);
+out:
return freq;
+}
+/*
- we pass in struct cpufreq_policy. This is safe because changing out the
- policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
- which tears down all of the data structures and __cpufreq_governor(policy,
- CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
- new policy pointer
- */
+static int gov_cfs_thread(void *data) +{
struct sched_param param;
struct cpufreq_policy *policy;
struct gov_data *gd;
unsigned long freq;
int ret;
policy = (struct cpufreq_policy *) data;
if (!policy) {
pr_warn("%s: missing policy\n", __func__);
do_exit(-EINVAL);
}
gd = policy->governor_data;
if (!gd) {
pr_warn("%s: missing governor data\n", __func__);
do_exit(-EINVAL);
}
param.sched_priority = 50;
ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m);
if (ret) {
pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
do_exit(-EINVAL);
} else {
pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
__func__, gd->task->pid);
}
ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
if (ret) {
pr_warn("%s: failed to set allowed ptr\n", __func__);
do_exit(-EINVAL);
}
/* main loop of the per-policy kthread */
do {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
if (kthread_should_stop())
break;
/* avoid race with gov_cfs_stop */
if (!down_write_trylock(&policy->rwsem))
continue;
freq = gov_cfs_select_freq(policy);
ret = __cpufreq_driver_target(policy, freq,
CPUFREQ_RELATION_H);
I think we should use CPUFREQ_RELATION_L here. From the comments I read:
#define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */
So we have to tell the driver to select a frequency with enough capacity (above the current one).
Thanks for all of the above comments. I'll fix all of them. this RELATION_L thing is annoying because I had it in my notes to fix it and forgot :-/
if (ret)
pr_debug("%s: __cpufreq_driver_target returned %d\n",
__func__, ret);
gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
up_write(&policy->rwsem);
} while (!kthread_should_stop());
do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work) +{
struct gov_data *gd;
gd = container_of(irq_work, struct gov_data, irq_work);
if (!gd) {
return;
}
No brackets?
Will fix.
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaint and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
On systems with discretized cpu frequencies (OPPs) we will often target the same OPP, occasionally crossing the boundary into another OPP. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think on how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomic_t is hard. Any ideas?
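One possible direction (an assumption on my part, sketched with C11 atomics rather than kernel primitives): keep the expiry as a single 64-bit word and do a plain atomic load in the hot path. The in-kernel analogue would be atomic64_t, with the caveat that its generic 32-bit implementation may itself take a lock, which is exactly the concern here.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t throttle_ns;	/* absolute expiry, like gd->throttle */

/* hot path: a single 64-bit atomic load, no lock */
static bool throttled(uint64_t now)
{
	return now < atomic_load_explicit(&throttle_ns, memory_order_relaxed);
}

/* slow path (governor thread): re-arm the window after a transition */
static void rearm(uint64_t now, uint64_t window)
{
	atomic_store_explicit(&throttle_ns, now + window, memory_order_relaxed);
}

int main(void)
{
	rearm(100, 50);
	printf("%d %d\n", throttled(120), throttled(200));	/* 1 0 */
	return 0;
}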
+}
+/**
- gov_cfs_update_cpu - interface to scheduler for changing capacity values
- @cpu: cpu whose capacity utilization has recently changed
- gov_cfs_update_cpu is an interface exposed to the scheduler so that the
- scheduler may inform the governor of updates to capacity utilization and
- make changes to cpu frequency. Currently this interface is designed around
- PELT values in CFS. It can be expanded to other scheduling classes in the
- future if needed.
- gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
- the thread that does the actual work, gov_cfs_thread.
- */
+void gov_cfs_update_cpu(int cpu) +{
struct cpufreq_policy *policy;
struct gov_data *gd;
/* XXX put policy pointer in per-cpu data? */
policy = cpufreq_cpu_get(cpu);
if (IS_ERR_OR_NULL(policy)) {
return;
}
if (!policy->governor_data) {
goto out;
}
gd = policy->governor_data;
/* bail early if we are throttled */
if (ktime_before(ktime_get(), gd->throttle)) {
goto out;
}
No brackets in the 3 ifs above?
Will fix.
Thanks, Mike
Thanks,
- Juri
irq_work_queue_on(&gd->irq_work, cpu);
+out:
cpufreq_cpu_put(policy);
return;
+}
+static void gov_cfs_start(struct cpufreq_policy *policy) +{
struct gov_data *gd;
/* prepare per-policy private data */
gd = kzalloc(sizeof(*gd), GFP_KERNEL);
if (!gd) {
pr_debug("%s: failed to allocate private data\n", __func__);
return;
}
/*
* Don't ask for freq changes at an higher rate than what
* the driver advertises as transition latency.
*/
gd->throttle_nsec = policy->cpuinfo.transition_latency ?
policy->cpuinfo.transition_latency :
THROTTLE_NSEC;
pr_debug("%s: throttle threshold = %u [ns]\n",
__func__, gd->throttle_nsec);
/* init per-policy kthread */
gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
if (IS_ERR_OR_NULL(gd->task))
pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
init_irq_work(&gd->irq_work, gov_cfs_irq_work);
policy->governor_data = gd;
gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy) +{
struct gov_data *gd;
gd = policy->governor_data;
kthread_stop(gd->task);
policy->governor_data = NULL;
/* FIXME replace with devm counterparts? */
kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event) +{
switch (event) {
case CPUFREQ_GOV_START:
/* Start managing the frequency */
gov_cfs_start(policy);
return 0;
case CPUFREQ_GOV_STOP:
gov_cfs_stop(policy);
return 0;
case CPUFREQ_GOV_LIMITS: /* unused */
case CPUFREQ_GOV_POLICY_INIT: /* unused */
case CPUFREQ_GOV_POLICY_EXIT: /* unused */
break;
}
return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS +static +#endif +struct cpufreq_governor cpufreq_gov_cfs = {
.name = "gov_cfs",
.governor = gov_cfs_setup,
.owner = THIS_MODULE,
+};
+static int __init gov_cfs_init(void) +{
return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+static void __exit gov_cfs_exit(void) +{
cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+/* Try to make this the default governor */ +fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
}
/* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 63a8be9..ec23523 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) int get_cpu_usage(int cpu); unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS +void gov_cfs_update_cpu(int cpu); +#else +static inline void gov_cfs_update_cpu(int cpu) {} +#endif
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- 1.9.1
Hi Mike,
I apologize in advance for the long email, but I'd still like to share today's thoughts with you :).
On 28/04/15 05:02, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-27 10:09:50)
[snip]
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaint and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
On systems with discretized cpu frequencies (OPPs) we will often target the same OPP, occasionally crossing the boundary into another OPP. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you give an example of such systems?
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
You mean because in this case we don't have any reference to base such a threshold on?
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think on how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomit_t is hard. Any ideas?
Don't we already bail out in gov_cfs_update_cpu() if we are not yet past the throttling threshold? This is in the hot path.
Anyway, I played a little bit with this version today and I came up with the following patches. The idea is to reduce triggering points, so that we - in theory - reduce the overall overhead of this thing. I ran simple synthetic workloads to test this, mainly tasks with phases and periodic workloads. I attach some plots to which I refer below, time on the x-axis and freqs on the y-axis.
With the first patch I tried to reduce the number of times we kick the kthread from task_tick_fair(). The idea is to extend the governor API so that we can ask for any capacity required (instead of letting it read the usage signal).
Fig1 shows a light/heavy/light task with the current implementation. As you pointed out, in the ramp-up phase we slowly adapt to the new utilization (each step also requires kicking the kthread). Fig2 shows the up-threshold approach, where we kick the kthread and go to max only when needed. Patch follows.
From 9f3d102e3f88d4e1d60c0d9497de709146e7f2ce Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com Date: Tue, 28 Apr 2015 14:10:57 +0100 Subject: [PATCH 1/4] sched/cpufreq_sched_cfs: implement direct API
Instead of using get_cpu_usage() we can let each CPU request the capacity it needs. The gov's kthread is responsible for aggregating requests.
A benefit of this new API is shown in task_tick_fair(), where we can request a transition to max opp only when really needed.
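Aggregation here amounts to taking the max of the outstanding per-cpu requests. A toy standalone version of the kthread side under this API (all values made up):

#include <stdio.h>

#define NR_CPUS 4

/* stands in for the per_cpu(new_capacity, cpu) slots */
static unsigned long new_capacity[NR_CPUS] = { 300, 875, 512, 0 };

int main(void)
{
	unsigned long max_usage = 0;

	/* kthread side: aggregate the outstanding requests */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (new_capacity[cpu] > max_usage)
			max_usage = new_capacity[cpu];

	printf("policy request: %lu\n", max_usage);	/* 875 */
	return 0;
}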
Signed-off-by: Juri Lelli juri.lelli@arm.com --- kernel/sched/cpufreq_sched_cfs.c | 18 +++++++++++------- kernel/sched/fair.c | 26 +++++++++++++++++++++----- kernel/sched/sched.h | 5 +++-- 3 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 040469d..c8c6d2e 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -14,9 +14,10 @@

 #include "sched.h"

-#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
 #define THROTTLE_NSEC		50000000 /* 50ms default */

+static DEFINE_PER_CPU(unsigned long, new_capacity);
+
 /**
  * gov_data - per-policy data internal to the governor
  * @throttle: next throttling period expiry. Derived from throttle_nsec
@@ -85,7 +86,7 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	 * lockless behavior.
 	 */
 	for_each_cpu(cpu, policy->cpus) {
-		usage = get_cpu_usage(cpu);
+		usage = per_cpu(new_capacity, cpu);
 		if (usage > max_usage)
 			max_usage = usage;
 	}
@@ -93,15 +94,13 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	/* add margin to max_usage based on imbalance_pct */
 	max_usage = max_usage * MARGIN_PCT / 100;

-	cpu = cpumask_first(policy->cpus);
-
-	if (max_usage >= capacity_orig_of(cpu)) {
+	if (max_usage >= SCHED_LOAD_SCALE) {
 		freq = policy->max;
 		goto out;
 	}

 	/* freq is current utilization + 25% */
-	freq = max_usage * policy->max / capacity_orig_of(cpu);
+	freq = (max_usage * policy->max) >> SCHED_LOAD_SHIFT;

 out:
 	return freq;
@@ -201,7 +200,7 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
  * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
  * the thread that does the actual work, gov_cfs_thread.
  */
-void gov_cfs_update_cpu(int cpu)
+void gov_cfs_update_cpu(int cpu, unsigned long capacity)
 {
 	struct cpufreq_policy *policy;
 	struct gov_data *gd;
@@ -223,6 +222,7 @@ void gov_cfs_update_cpu(int cpu)
 		goto out;
 	}

+	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
@@ -233,6 +233,7 @@ out:
 static void gov_cfs_start(struct cpufreq_policy *policy)
 {
 	struct gov_data *gd;
+	int cpu;

 	/* prepare per-policy private data */
 	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
@@ -251,6 +252,9 @@ static void gov_cfs_start(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		 __func__, gd->throttle_nsec);

+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(new_capacity, cpu) = 0;
+
 	/* init per-policy kthread */
 	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
 	if (IS_ERR_OR_NULL(gd->task))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 041538e..27e21a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4267,7 +4267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4332,7 +4332,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4800,6 +4800,12 @@ next:
 done:
 	return target;
 }
+
+unsigned long capacity_curr_of(int cpu)
+{
+	return arch_scale_freq_capacity(NULL, cpu);
+}
+
 /*
  * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
@@ -4817,7 +4823,7 @@ done:
  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
  */
-int get_cpu_usage(int cpu)
+static int get_cpu_usage(int cpu)
 {
 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
@@ -7820,6 +7826,11 @@ static void rq_offline_fair(struct rq *rq)

 #endif /* CONFIG_SMP */

+static inline unsigned long task_utilization(struct task_struct *p)
+{
+	return p->se.avg.utilization_avg_contrib;
+}
+
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
@@ -7827,6 +7838,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	int cpu = task_cpu(curr);

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

 	update_rq_runnable_avg(rq, 1);

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+	if (sched_energy_freq() &&
+	    (capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
+	    ((capacity_curr_of(cpu) * 100) <
+	     (task_utilization(curr) * MARGIN_PCT))) {
+		gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
+	}
 }

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec23523..3983bd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,11 +1396,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif

-int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);
+unsigned long capacity_curr_of(int cpu);

 #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
-void gov_cfs_update_cpu(int cpu);
+#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
+void gov_cfs_update_cpu(int cpu, unsigned long capacity);
 #else
 static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
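[Editor's note: to make the selection math above concrete, here is a worked example with illustrative numbers, assuming SCHED_LOAD_SCALE = 1024 and SCHED_LOAD_SHIFT = 10:]

	/*
	 * max_usage = 410 -> 410 * 125 / 100 = 512 after the margin,
	 * so freq = (512 * policy->max) >> 10, i.e. half of policy->max.
	 * Any request of 820 or more saturates the margin
	 * (820 * 125 / 100 >= 1024) and pins the policy at policy->max.
	 */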
On 28 April 2015 at 19:48, Juri Lelli juri.lelli@arm.com wrote:
Hi Mike,
I apologize in advance for the long email, but I still wanted to share today's thoughts with you :).
On 28/04/15 05:02, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-27 10:09:50)
[snip]
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaints and pushback. I understand that we might not want to start optimizing things yet, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check whether we are beyond the 25% threshold and kick the kthread only in that case.
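[Editor's note: a minimal sketch of what the suggested pre-check might look like; the helper name and threshold semantics are assumptions, and capacity_curr_of() is the accessor added in the patch below:]

	/* hypothetical: kick the kthread only once usage plus the 25%
	 * margin no longer fits in the currently provided capacity */
	static inline bool cpu_needs_freq_update(int cpu)
	{
		unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;

		return (usage * MARGIN_PCT / 100) > capacity_curr_of(cpu);
	}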
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
On systems with discretized cpu frequencies (opps) we will often target the same opp, occasionally crossing the boundary into another opp. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you make an example of such systems?
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
You mean because in this case we don't have any reference to base such a threshold on?
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think about how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomic_t is hard. Any ideas?
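[Editor's note: one possible direction, as a sketch of an idea rather than code from this series: keep the throttle deadline as nanoseconds in an atomic64_t, so the hot path can compare against ktime without taking the governor's lock. On 32-bit ARM the generic atomic64_t falls back to hashed spinlocks, so this only avoids the governor's own locking:]

	static atomic64_t throttle_until_ns;	/* hypothetical per-policy field */

	static inline bool gov_throttled(void)
	{
		return ktime_to_ns(ktime_get()) < atomic64_read(&throttle_until_ns);
	}

	static inline void gov_rearm_throttle(u64 throttle_nsec)
	{
		atomic64_set(&throttle_until_ns,
			     ktime_to_ns(ktime_get()) + throttle_nsec);
	}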
Don't we already bail out in gov_cfs_update_cpu() if we are not yet past the throttling threshold? This is in the hot path.
Anyway, I played a little bit with this version today and I came up with the following patches. The idea is to reduce the triggering points, so that we - in theory - reduce the overall overhead of this thing. I ran simple synthetic workloads to test this, mainly tasks with phases and periodic workloads. I attach some plots to which I refer below, time on the x-axis and freqs on the y-axis.
With the first patch I tried to reduce the number of times we kick the kthread from task_tick_fair(). The idea is to extend the governor API so that we can ask for any capacity required (instead of letting it read the usage signal).
[snip]
@@ -4267,7 +4267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
Using utilization_load_avg is an interesting way to remove the dependency on the CPU's capacity.
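[Editor's note: in sketch form, reconstructed for comparison and not part of the patch, the difference is:]

	/* old request: utilization rescaled by this cpu's original capacity */
	usage = (cpu_rq(cpu)->cfs.utilization_load_avg *
		 capacity_orig_of(cpu)) >> SCHED_LOAD_SHIFT;

	/* new request: the raw signal, already in the 0..SCHED_LOAD_SCALE
	 * range and directly comparable across cpus in the policy */
	usage = cpu_rq(cpu)->cfs.utilization_load_avg;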
[snip]
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+	if (sched_energy_freq() &&
+	    (capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
+	    ((capacity_curr_of(cpu) * 100) <
+	     (task_utilization(curr) * MARGIN_PCT))) {
+		gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
This looks like a policy (similar to the ondemand one), and I'm not sure that we should have such a policy in the tick.
+	}
 }
[snip]
I then tried to address the tail effect when the task starts behaving as light again (~2 sec in the pictures). The problem is that when we dequeue the task we see no utilization on the rq, so we go to min. When the task is enqueued back we go to max (at least the first time), and then we keep doing this sort of ping-pong until we converge to the actual (light) utilization. The following patch changes this behaviour, as in Fig.3: in the tail we slowly adapt to the new task phase, following the decaying utilization signal.
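[Editor's note: before the full patch below, a condensed sketch of its dequeue path, which is the core of the change:]

	if (sched_energy_freq()) {
		if (rq->cfs.nr_running)
			/* still busy: request capacity for what remains */
			gov_cfs_update_cpu(cpu_of(rq),
					   rq->cfs.utilization_load_avg);
		else
			/* going idle: clear our vote without kicking the
			 * kthread into an immediate transition to min */
			gov_cfs_reset_cpu(cpu_of(rq));
	}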
From 81cefca25fa022913dc2913acf71414925b997eb Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:08:37 +0100
Subject: [PATCH 2/4] sched/cpufreq_sched_cfs: (re)move triggering points
Remove the trigger in enqueue_task_fair() and move it to select_task_rq_fair(); also consider the pre-decayed task utilization, as we want to stabilize capacity requests.
Modify the dequeue_task_fair() trigger: don't scale down when we are going idle (this change requires a small addition to the governor's API).
Signed-off-by: Juri Lelli juri.lelli@arm.com
---
 kernel/sched/cpufreq_sched_cfs.c |  5 +++++
 kernel/sched/fair.c              | 32 ++++++++++++++++++++++----------
 kernel/sched/sched.h             |  1 +
 3 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index c8c6d2e..2fe1684 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -187,6 +187,11 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
 	wake_up_process(gd->task);
 }

+void gov_cfs_reset_cpu(int cpu)
+{
+	per_cpu(new_capacity, cpu) = 0;
+}
+
 /**
  * gov_cfs_update_cpu - interface to scheduler for changing capacity values
  * @cpu: cpu whose capacity utilization has recently changed
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27e21a1..4e21abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4266,9 +4266,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		add_nr_running(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
-
 	hrtick_update(rq);
 }
@@ -4331,8 +4328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
+	if(sched_energy_freq()) {
+		/*
+		 * Ask for an update only if we are not going idle.
+		 * If we are going idle we just need to clear our
+		 * current request.
+		 */
+		if (rq->cfs.nr_running)
+			gov_cfs_update_cpu(cpu_of(rq),
+					   rq->cfs.utilization_load_avg);
+		else
+			gov_cfs_reset_cpu(cpu_of(rq));
+	}

 	hrtick_update(rq);
 }
@@ -4834,6 +4841,11 @@ static int get_cpu_usage(int cpu)
 	return (usage * capacity) >> SCHED_LOAD_SHIFT;
 }

+static inline unsigned long task_utilization(struct task_struct *p)
+{
+	return p->se.avg.utilization_avg_contrib;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4922,6 +4934,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 unlock:
 	rcu_read_unlock();

+	/* We want to consider the pre-decayed utilization */
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(new_cpu,
+				   cpu_rq(new_cpu)->cfs.utilization_load_avg +
+				   task_utilization(p));
+
This can be seen as an artifact to boost the frequency, as you use an old value. Then, you cannot be sure that the scheduler will select this cpu in the end (because of cpu affinity, for example); the only safe place is the enqueue function.
 	return new_cpu;
 }
@@ -7826,11 +7843,6 @@ static void rq_offline_fair(struct rq *rq)

 #endif /* CONFIG_SMP */

-static inline unsigned long task_utilization(struct task_struct *p)
-{
-	return p->se.avg.utilization_avg_contrib;
-}
-
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3983bd6..6dd8f3a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1402,6 +1402,7 @@ unsigned long capacity_curr_of(int cpu);
 #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
 #define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
 void gov_cfs_update_cpu(int cpu, unsigned long capacity);
+void gov_cfs_reset_cpu(int cpu);
 #else
 static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
2.2.2
This approach seems to work also for a light/medium/light task: we go to max and then adapt to the real (medium) utilization (Fig.4).
Finally, a couple more patches (the first one should actually be squashed into 01/04) to cover the load_balancing paths (not really tested yet).
From 3e7226989c21fdd680279f4f8a150597b5833b95 Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:10:00 +0100
Subject: [PATCH 3/4] sched/cpufreq_sched_cfs: update requested capacity even when throttled
If the kthread is throttled we still need to update requests, or we may end up with stale values.
Signed-off-by: Juri Lelli juri.lelli@arm.com
---
 kernel/sched/cpufreq_sched_cfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 2fe1684..c8d9408 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -222,12 +222,13 @@ void gov_cfs_update_cpu(int cpu, unsigned long capacity)

 	gd = policy->governor_data;

+	per_cpu(new_capacity, cpu) = capacity;
+
 	/* bail early if we are throttled */
 	if (ktime_before(ktime_get(), gd->throttle)) {
 		goto out;
 	}

-	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
2.2.2
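[Editor's note: the essence of the fix is just the ordering; a sketch with the surrounding code elided:]

	per_cpu(new_capacity, cpu) = capacity;	/* always record the request */

	/* bail early if we are throttled: skip only the kick */
	if (ktime_before(ktime_get(), gd->throttle))
		goto out;

	irq_work_queue_on(&gd->irq_work, cpu);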
From 333e1741c7de8dfc21f5bb9f2a9c29d4dc84f2de Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:55:22 +0100
Subject: [PATCH 4/4] sched/fair: cpufreq_sched_cfs triggers for load_balancing
This should cover load_balance paths (untested).
Signed-off-by: Juri Lelli juri.lelli@arm.com
---
 kernel/sched/fair.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e21abf..ad1e7cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7203,6 +7203,14 @@ out_one_pinned:
 	ld_moved = 0;

 out:
+	/* dst_grpmask might be NULL for NEWLY_IDLE. */
+	if (sched_energy_freq() && ld_moved && env.dst_grpmask)
+		/*
+		 * dequeue_task_fair() already took care of src_cpu
+		 */
+		gov_cfs_update_cpu(env.dst_cpu,
+				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);
+
dst_cpu is only the last cpu that was used for load balancing, but more cpus can have been involved, so you can miss some task migrations; enqueue/dequeue are the only safe places.
return ld_moved;
}
@@ -7402,8 +7410,12 @@ out_unlock:
 	busiest_rq->active_balance = 0;
 	raw_spin_unlock(&busiest_rq->lock);

-	if (p)
+	if (p) {
 		attach_one_task(target_rq, p);
+		if (sched_energy_freq())
+			gov_cfs_update_cpu(cpu_of(target_rq),
+					   target_rq->cfs.utilization_load_avg);
+	}

 	local_irq_enable();
-- 2.2.2
Comments?
Multiplying the hooks increases the complexity and the risk of regressions each time there is a change in the scheduler. So staying in enqueue/dequeue is safer, as those are the only places that ensure a task will be put on a rq.
Thanks,
- Juri
Hi Vincent,
thanks for your review.
On 29/04/15 08:52, Vincent Guittot wrote:
[snip]
int cpu = task_cpu(curr); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se);
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
if (sched_energy_freq() &&
(capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
((capacity_curr_of(cpu) * 100) <
(task_utilization(curr) * MARGIN_PCT))) {
gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
This looks like a policy (similar to the ondemand one), and I'm not sure that we should have such a policy in the tick.
The rationale behind this is that if we base all our decisions on the utilization signal, we only get a "true" signal when we don't saturate the current capacity. Here we realize that we are most likely going to saturate it, so we react by going to max. When the task is subsequently dequeued we probably have a better chance to pick the "true" capacity level (when this task is queued back, for example).
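[Editor's note: illustrative numbers for the check above, assuming SCHED_LOAD_SCALE = 1024: a cpu running at half its top frequency has capacity_curr_of(cpu) = 512, so any task utilization above 512 * 100 / 125 = 409 trips the condition. E.g. for a utilization of 450:]

	/* 450 * 125 = 56250 > 512 * 100 = 51200: the signal is about to
	 * saturate the current capacity, so jump straight to max */
	gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);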
[snip]
@@ -4922,6 +4934,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 unlock:
 	rcu_read_unlock();

+	/* We want to consider the pre-decayed utilization */
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(new_cpu,
+				   cpu_rq(new_cpu)->cfs.utilization_load_avg +
+				   task_utilization(p));
This can be seen as an artifact to boost the frequency, as you use an old value.
Well, as the old value is the only thing I know about the task (we don't have blocked_utilization here yet), I was trying to avoid kicking the thing in the tick right after the task starts running, and instead trying to predict what will probably happen.
Then, you are not sure that the scheduler will select this cpu in the end (because of cpu affinity, for example); the only safe place is the enqueue function.
So, select_task_rq_fair() is not even called for tasks with a single-cpu affinity mask; we would have to put a check in select_task_rq(), right. But after that we should be pretty sure about where the task will be woken up.
[snip]
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e21abf..ad1e7cc 100644
@@ -7203,6 +7203,14 @@ out_one_pinned:
 	ld_moved = 0;

 out:
+	/* dst_grpmask might be NULL for NEWLY_IDLE. */
+	if (sched_energy_freq() && ld_moved && env.dst_grpmask)
+		/*
+		 * dequeue_task_fair() already took care of src_cpu
+		 */
+		gov_cfs_update_cpu(env.dst_cpu,
+				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);
dst_cpu is only the last cpu that was used for load balancing, but more cpus can have been involved, so you can miss some task migrations; enqueue/dequeue are the only safe places.
Right, for the new_dst_cpu case. I guess we have to move this trigger up (probably inside the more_balance loop).
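[Editor's note: a hypothetical placement, an untested sketch only; cur_ld_moved is the per-iteration count inside load_balance():]

	/* inside the more_balance loop, after tasks were attached: */
	if (sched_energy_freq() && cur_ld_moved)
		gov_cfs_update_cpu(env.dst_cpu,
				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);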
[snip]
Comments?
Multiplying the hooks increases the complexity and the risk of regressions each time there is a change in the scheduler. So staying in enqueue/dequeue is safer, as those are the only places that ensure a task will be put on a rq.
So, I actually started out thinking and doing exactly what you are saying, but then Peter's reply came on LKML and I started thinking about how (and if) we can do something finer grained. I still agree that hooks in enqueue/dequeue/tick (maybe even in core) make all this a lot easier, but at the same time we risk adding useless overhead or even missing potentially needed freq changes (think for example of moving a lot of tasks between rqs during a load balance iteration).
Best,
- Juri
Quoting Juri Lelli (2015-04-28 10:48:27)
[snip]
On systems with discretized cpu frequencies (opps) we will often target the same opp, occasionally crossing the boundary into another opp. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you make an example of such systems?
CPPC-based systems.
I thought a lot about all of the feedback that my v1 patchset got last week on eas-dev. Two comments in particular colored my views on supporting continuous frequency bands and not relying on a threshold.
First is Ashwins' comment here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000093.html
Second is Morten's reply here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000094.html
If we decide that we only care about opps then it is easy to create a threshold for the opp "bucket" that we are currently in. But on a continuous system creating a threshold is more difficult. E.g. if we decide to use an 80% threshold for a continuous system, we can easily determine whether our current utilization exceeds this threshold at our current capacity/frequency. But what is the new frequency target? Without a table to guide us we have to just make something up!
So I decided to transmute the threshold into a margin. Instead of checking to see if we crossed some boundary we always try to maintain a bit of overhead. This works for table-based and table-less systems, and allows us to hit the minimum and maximum frequencies without any weird corner cases.
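[Editor's note: a concrete illustration of the margin, with made-up numbers: with utilization at 600 out of 1024, the request becomes]

	max_usage = 600 * 125 / 100;			/* = 750 */
	freq = (750 * policy->max) >> SCHED_LOAD_SHIFT;	/* ~73% of max */

[On a table-based system cpufreq rounds that up to the next opp; on a table-less system it is the request itself. A fixed 80% threshold would only tell us to go higher, not how much higher.]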
So we can't easily check if the cpu frequency needs to change or not in the scheduler hot path using this method.
You mean because in this case we don't have any reference to base such a threshold on?
Correct.
An alternative is to put the throttle check in the hot path and not kick the thread until we are unthrottled. I need to think about how to do this. I'd like to do it without locking, but mixing 64-bit ktime_t with 32-bit atomic_t is hard. Any ideas?
Don't we already bail out in gov_cfs_update_cpu() if we are not yet past the throttling threshold? This is in the hot path.
Yeah, but it is still hidden behind the rwsem. I'd like to get rid of that too if I can think of a way.
[snip]
Thanks for the patches. I'll review them and get back to you but I wanted to respond to your questions above asap.
Regards, Mike
[snip]
On 29/04/15 09:32, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-28 10:48:27)
[snip]
On systems with discretized cpu frequencies (opps) we will often target the same opp, occasionally crossing the boundary into another opp. On systems with continuous cpu frequencies we will continually give ourselves "room to grow".
Can you make an example of such systems?
CPPC-based systems.
I thought a lot about all of the feedback that my v1 patchset got last week on eas-dev. Two comments in particular colored my views on supporting continuous frequency bands and not relying on a threshold.
First is Ashwins' comment here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000093.html
Second is Morten's reply here: https://lists.linaro.org/pipermail/eas-dev/2015-April/000094.html
If we decide that we only care about opps then it is easy to create a threshold for the opp "bucket" that we are currently in. But on a continuous system creating a threshold is more difficult. E.g. if we decide to use an 80% threshold for a continuous system, we can easily determine whether our current utilization exceeds this threshold at our current capacity/frequency. But what is the new frequency target? Without a table to guide us we have to just make something up!
Right, but I'm still not sure that we want to continuously adapt to the current usage (plus the margin), as we might introduce too much overhead. Also, is it really worthwhile to activate all this just to save a little more power or run a little faster? This is really blue-sky, but maybe a trade-off would be to discretize such systems (if it makes sense to control them from the scheduler at all). Yes, we already have an activation threshold, but I'm not sure it is enough.
So I decided to transmute the threshold into a margin. Instead of checking to see if we crossed some boundary we always try to maintain a bit of overhead. This works for table-based and table-less systems, and allows us to hit the minimum and maximum frequencies without any weird corner cases.
Ok, the margin is fine. I kept it in my deltas below. I think we also need it to be able to stabilize capacity requests.
[snip]
Yeah, but it is still hidden behind the rwsem. I'd like to get rid of that too if I can think of a way.
Agreed, we should move away from that.
[snip]
Thanks for the patches. I'll review them and get back to you but I wanted to respond to your questions above asap.
Thanks!
Best,
- Juri
Regards, Mike
From 9f3d102e3f88d4e1d60c0d9497de709146e7f2ce Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 14:10:57 +0100
Subject: [PATCH 1/4] sched/cpufreq_sched_cfs: implement direct API
Instead of using get_cpu_usage() we can let each CPU request the capacity it needs. The gov's kthread is responsible for aggregating requests.
A benefit of this new API is shown in task_tick_fair(), where we can request a transition to max opp only when really needed.
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/cpufreq_sched_cfs.c | 18 +++++++++++-------
 kernel/sched/fair.c              | 26 +++++++++++++++++++++-----
 kernel/sched/sched.h             |  5 +++--
 3 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 040469d..c8c6d2e 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -14,9 +14,10 @@

 #include "sched.h"

-#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
 #define THROTTLE_NSEC		50000000 /* 50ms default */

+static DEFINE_PER_CPU(unsigned long, new_capacity);
+
 /**
  * gov_data - per-policy data internal to the governor
  * @throttle: next throttling period expiry. Derived from throttle_nsec
@@ -85,7 +86,7 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	 * lockless behavior.
 	 */
 	for_each_cpu(cpu, policy->cpus) {
-		usage = get_cpu_usage(cpu);
+		usage = per_cpu(new_capacity, cpu);
 		if (usage > max_usage)
 			max_usage = usage;
 	}
@@ -93,15 +94,13 @@ static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
 	/* add margin to max_usage based on imbalance_pct */
 	max_usage = max_usage * MARGIN_PCT / 100;

-	cpu = cpumask_first(policy->cpus);
-
-	if (max_usage >= capacity_orig_of(cpu)) {
+	if (max_usage >= SCHED_LOAD_SCALE) {
 		freq = policy->max;
 		goto out;
 	}

 	/* freq is current utilization + 25% */
-	freq = max_usage * policy->max / capacity_orig_of(cpu);
+	freq = (max_usage * policy->max) >> SCHED_LOAD_SHIFT;

 out:
 	return freq;
@@ -201,7 +200,7 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
  * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
  * the thread that does the actual work, gov_cfs_thread.
  */
-void gov_cfs_update_cpu(int cpu)
+void gov_cfs_update_cpu(int cpu, unsigned long capacity)
 {
 	struct cpufreq_policy *policy;
 	struct gov_data *gd;
@@ -223,6 +222,7 @@ void gov_cfs_update_cpu(int cpu)
 		goto out;
 	}

+	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
@@ -233,6 +233,7 @@ out:
 static void gov_cfs_start(struct cpufreq_policy *policy)
 {
 	struct gov_data *gd;
+	int cpu;

 	/* prepare per-policy private data */
 	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
@@ -251,6 +252,9 @@ static void gov_cfs_start(struct cpufreq_policy *policy)
 	pr_debug("%s: throttle threshold = %u [ns]\n",
 		 __func__, gd->throttle_nsec);

+	for_each_cpu(cpu, policy->related_cpus)
+		per_cpu(new_capacity, cpu) = 0;
+
 	/* init per-policy kthread */
 	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
 	if (IS_ERR_OR_NULL(gd->task))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 041538e..27e21a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4267,7 +4267,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4332,7 +4332,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}

 	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);

 	hrtick_update(rq);
 }
@@ -4800,6 +4800,12 @@ next:
 done:
 	return target;
 }

+unsigned long capacity_curr_of(int cpu)
+{
+	return arch_scale_freq_capacity(NULL, cpu);
+}
+
 /*
  * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
@@ -4817,7 +4823,7 @@ done:
  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
  */
-int get_cpu_usage(int cpu)
+static int get_cpu_usage(int cpu)
 {
 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
@@ -7820,6 +7826,11 @@ static void rq_offline_fair(struct rq *rq)

 #endif /* CONFIG_SMP */

+static inline unsigned long task_utilization(struct task_struct *p)
+{
+	return p->se.avg.utilization_avg_contrib;
+}
+
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
@@ -7827,6 +7838,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	int cpu = task_cpu(curr);

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -7838,8 +7850,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

 	update_rq_runnable_avg(rq, 1);

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq));
+	if (sched_energy_freq() &&
+	    (capacity_curr_of(cpu) < SCHED_LOAD_SCALE) &&
+	    ((capacity_curr_of(cpu) * 100) <
+	     (task_utilization(curr) * MARGIN_PCT))) {
+		gov_cfs_update_cpu(cpu_of(rq), SCHED_LOAD_SCALE);
+	}
 }

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec23523..3983bd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1396,11 +1396,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif

-int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);
+unsigned long capacity_curr_of(int cpu);

 #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
-void gov_cfs_update_cpu(int cpu);
+#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
+void gov_cfs_update_cpu(int cpu, unsigned long capacity);
 #else
 static inline void gov_cfs_update_cpu(int cpu) {}
 #endif
2.2.2
I then tried to address the tail effect when the task starts behaving as light again (~2 sec in the plots). The problem is that when we dequeue the task we see no utilization on the rq and we go to min. When the task is enqueued back we go to max (at least the first time) and then we continue this sort of ping-pong until we converge to the actual (light) utilization. The following patch changes this behaviour as in Fig.3: in the tail we slowly adapt to the new task phase, considering the decaying effect of the utilization.
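As a rough worked example of that tail (editorial numbers, assuming the usual PELT half-life of about 32 ms): a task running at utilization ~700 that sleeps for one half-life wakes with its contribution decayed to ~350, and after two half-lives to ~175, while the idle rq's own signal has meanwhile decayed toward zero. Summing the rq utilization and the waking task's (pre-decayed) contribution, as the patch below does in select_task_rq_fair(), keeps the request anchored near the task's recent demand instead of restarting the ramp from the bottom on every wakeup.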
From 81cefca25fa022913dc2913acf71414925b997eb Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:08:37 +0100
Subject: [PATCH 2/4] sched/cpufreq_sched_cfs: (re)move triggering points
remove the trigger in enqueue_task_fair() and move it into select_task_rq_fair(); also consider the pre-decayed task utilization, as we want to stabilize capacity requests

modify the dequeue_task_fair() trigger; don't scale down when we are going idle (this change requires a small addition to the governor's API)
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/cpufreq_sched_cfs.c |  5 +++++
 kernel/sched/fair.c              | 32 ++++++++++++++++++++++----------
 kernel/sched/sched.h             |  1 +
 3 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index c8c6d2e..2fe1684 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -187,6 +187,11 @@ static void gov_cfs_irq_work(struct irq_work *irq_work)
 	wake_up_process(gd->task);
 }

+void gov_cfs_reset_cpu(int cpu)
+{
+	per_cpu(new_capacity, cpu) = 0;
+}
+
 /**
  * gov_cfs_update_cpu - interface to scheduler for changing capacity values
  * @cpu: cpu whose capacity utilization has recently changed
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 27e21a1..4e21abf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4266,9 +4266,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		add_nr_running(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
-
 	hrtick_update(rq);
 }
@@ -4331,8 +4328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, 1);
 	}

-	if(sched_energy_freq())
-		gov_cfs_update_cpu(cpu_of(rq), rq->cfs.utilization_load_avg);
+	if(sched_energy_freq()) {
+		/*
+		 * Ask for an update only if we are not going idle.
+		 * If we are going idle we just need to clear our
+		 * current request.
+		 */
+		if (rq->cfs.nr_running)
+			gov_cfs_update_cpu(cpu_of(rq),
+					   rq->cfs.utilization_load_avg);
+		else
+			gov_cfs_reset_cpu(cpu_of(rq));
+	}

 	hrtick_update(rq);
 }
} @@ -4834,6 +4841,11 @@ static int get_cpu_usage(int cpu) return (usage * capacity) >> SCHED_LOAD_SHIFT; }
+static inline unsigned long task_utilization(struct task_struct *p) +{
return p->se.avg.utilization_avg_contrib;
+}
/*
- select_task_rq_fair: Select target runqueue for the waking task in domains
- that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4922,6 +4934,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f unlock: rcu_read_unlock();
/* We want to consider the pre-decayed utilization */
if(sched_energy_freq())
gov_cfs_update_cpu(new_cpu,
cpu_rq(new_cpu)->cfs.utilization_load_avg +
task_utilization(p)); return new_cpu;
}
@@ -7826,11 +7843,6 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */
-static inline unsigned long task_utilization(struct task_struct *p) -{
return p->se.avg.utilization_avg_contrib;
-}
/*
- scheduler tick hitting a task of our scheduling class:
*/ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3983bd6..6dd8f3a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1402,6 +1402,7 @@ unsigned long capacity_curr_of(int cpu); #ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS #define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ void gov_cfs_update_cpu(int cpu, unsigned long capacity); +void gov_cfs_reset_cpu(int cpu); #else static inline void gov_cfs_update_cpu(int cpu) {}
#endif
2.2.2
This approach seems to work also for a light/medium/light task. We go to max and then adapt to the real (medium) utilization (Fig.4).
Finally, a couple more patches (the first should actually be squashed into 01/04) to cover the load-balancing paths (not really tested yet).
From 3e7226989c21fdd680279f4f8a150597b5833b95 Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:10:00 +0100
Subject: [PATCH 3/4] sched/cpufreq_sched_cfs: update requested capacity even when throttled
If the kthread is throttled we still need to update requests, or we may end up with stale values.
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/cpufreq_sched_cfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
index 2fe1684..c8d9408 100644
--- a/kernel/sched/cpufreq_sched_cfs.c
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -222,12 +222,13 @@ void gov_cfs_update_cpu(int cpu, unsigned long capacity)

 	gd = policy->governor_data;

+	per_cpu(new_capacity, cpu) = capacity;
+
 	/* bail early if we are throttled */
 	if (ktime_before(ktime_get(), gd->throttle)) {
 		goto out;
 	}

-	per_cpu(new_capacity, cpu) = capacity;
 	irq_work_queue_on(&gd->irq_work, cpu);

 out:
2.2.2
From 333e1741c7de8dfc21f5bb9f2a9c29d4dc84f2de Mon Sep 17 00:00:00 2001
From: Juri Lelli juri.lelli@arm.com
Date: Tue, 28 Apr 2015 16:55:22 +0100
Subject: [PATCH 4/4] sched/fair: cpufreq_sched_cfs triggers for load_balancing
This should cover load_balance paths (untested).
Signed-off-by: Juri Lelli juri.lelli@arm.com
 kernel/sched/fair.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e21abf..ad1e7cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7203,6 +7203,14 @@ out_one_pinned:
 	ld_moved = 0;

 out:
+	/* dst_grpmask might be NULL for NEWLY_IDLE. */
+	if (sched_energy_freq() && ld_moved && env.dst_grpmask)
+		/*
+		 * dequeue_task_fair() already took care of src_cpu
+		 */
+		gov_cfs_update_cpu(env.dst_cpu,
+				   cpu_rq(env.dst_cpu)->cfs.utilization_load_avg);
+
 	return ld_moved;
 }
@@ -7402,8 +7410,12 @@ out_unlock:
 	busiest_rq->active_balance = 0;
 	raw_spin_unlock(&busiest_rq->lock);

-	if (p)
+	if (p) {
 		attach_one_task(target_rq, p);
+		if (sched_energy_freq())
+			gov_cfs_update_cpu(cpu_of(target_rq),
+					   target_rq->cfs.utilization_load_avg);
+	}

 	local_irq_enable();
-- 2.2.2
Comments?
Thanks,
- Juri
Hi Juri,
On 29 April 2015 at 05:39, Juri Lelli juri.lelli@arm.com wrote:
On 29/04/15 09:32, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-28 10:48:27)
Hi Mike,
I apologize in advance for the long email, but I still wanted to share today's thoughts with you :).
On 28/04/15 05:02, Michael Turquette wrote:
Quoting Juri Lelli (2015-04-27 10:09:50)
[snip]
wake_up_process(gd->task);
So, we always wake up the kthread, even when we know that we won't need a freq change. This might be, I fear, an almost certain source of reasonable complaints and pushback. I understand that we might not want to start optimizing things, but IMHO this point deserves some more thought before posting. Don't you think we could do some level of aggregation before kicking the kthread? In task_tick_fair(), for example, we could just check if we are beyond the 25% threshold and kick the kthread only in that case.
This patch does not check against a threshold. It always requests a rate based on the current utilization plus 25%.
[snip]
IIUC, the optimization you're getting at is to suppress the CPU freq requests when they fall within some range of the current OPP? I think this may hamper certain latency sensitive workloads, since the freq ramp up could potentially be slowed down. So, there's some merit in making the request path as quick as possible and allowing for continuous adaptation. I need to look at your patches in more detail, but eyeballing them it seems like you're trying to achieve that.
From the energy model perspective, can a continuous performance band
be supported at all or is it a hard requirement to have a discretized table?
Regards, Ashwin.
Hi Ashwin,
On 04/05/15 14:41, Ashwin Chaugule wrote:
Hi Juri,
[snip]
IIUC, the optimization you're getting at is to suppress the CPU freq requests when they fall within some range of the current OPP? I think this may hamper certain latency sensitive workloads, since the freq ramp up could potentially be slowed down. So, there's some merit in making the request path as quick as possible and allowing for continuous adaptation. I need to look at your patches in more detail, but eyeballing them it seems like you're trying to achieve that.
So, the energy model (and please mind that the patches on top of Mike's patchset don't have that yet) currently gives you these "capacity bands". The idea is to try to adapt the OPP selection to the usage you see on your CPU/cluster. Since the usage signal is subject to saturation, what I'm trying to do is to avoid this condition by jumping up to the max available OPP when we realize that we are going to saturate a particular OPP. After we run for a small interval of time (say a tick) at that max OPP we can better estimate the real usage and directly select an OPP ("capacity band") that suits it.
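To put editorial numbers on the saturation problem: at an OPP whose current capacity is 410 out of 1024, a cpu-bound task's utilization signal can never climb past ~410, so a reading of 400 is ambiguous between "this task needs 40% of the cpu" and "this task will take everything it can get". Jumping to the max OPP removes the ceiling; if after a tick the signal settles at, say, 620, the governor can drop straight to the band covering 620 * 125 / 100 = 775 instead of ramping up step by step.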
I see your point, though. I think the two approaches differ for how we get to the desired capacity: ramping up from bottom vs. selecting from top.
From the energy model perspective, can a continuous performance band be supported at all or is it a hard requirement to have a discretized table?
I don't think it's a hard requirement (Morten or Dietmar may correct me here), but just an abstraction of the systems we develop on today. I guess we would need to compute some formulas at run time, instead of reading tabular values, if we want to have continuous performance bands. Food for thought :).
Thanks,
- Juri
On Tue, May 5, 2015 at 3:12 AM, Juri Lelli juri.lelli@arm.com wrote:
[snip]
So, the energy model (and please mind that the patches on top of Mike's patchset don't have that yet) currently gives you these "capacity bands". The idea is to try to adapt the OPP selection to the usage you see on your CPU/cluster. Since the usage signal is subject to saturation, what I'm trying to do is to avoid this condition by jumping up to the max available OPP when we realize that we are going to saturate a particular OPP. After we run for a small interval of time (say a tick) at that max OPP we can better estimate the real usage and directly select an OPP ("capacity band") that suits it.
I'm not sure about jumping to the max frequency when we detect that the signal is saturated.
Ondemand has similar behavior to this and many vendors have implemented out-of-tree solutions that do something like setting the frequency to an "intermediate" rate (maybe 2/3 of the total performance band) and then re-evaluate if they need to jump to max performance after another sampling period.
So at some point you might face the same issue, where vendors find this approach too aggressive and too wasteful of power, thus some intermediate level will be introduced. I'm not providing you any solutions here, but I'm saying that designing a policy algorithm that works well for everyone is super hard.
I see your point, though. I think the two approaches differ for how we get to the desired capacity: ramping up from bottom vs. selecting from top.
From the energy model perspective, can a continuous performance band be supported at all or is it a hard requirement to have a discretized table?
I don't think it's a hard requirement (Morten or Dietmar may correct me here), but just an abstraction of the systems we develop on today. I guess we would need to compute some formulas at run time, instead of reading tabular values, if we want to have continuous performance bands. Food for thought :).
We could also tablify continuous frequency domains based on some reasonable factor like 50 MHz or something. I guess that factor could even be supplied by the driver.
Regards, Mike
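As an editorial aside, here is a minimal sketch of the "tablify" idea above (helper and parameter names are made up; the step would come from the driver, e.g. 50000 kHz):

#include <linux/kernel.h>	/* DIV_ROUND_UP, min */

/* quantize a request in a continuous [min_khz, max_khz] domain */
static unsigned int quantize_freq(unsigned int req_khz,
				  unsigned int min_khz,
				  unsigned int max_khz,
				  unsigned int step_khz)
{
	unsigned int steps;

	if (req_khz <= min_khz)
		return min_khz;
	if (req_khz >= max_khz)
		return max_khz;

	/* round up so quantization never eats the requested headroom */
	steps = DIV_ROUND_UP(req_khz - min_khz, step_khz);
	return min(min_khz + steps * step_khz, max_khz);
}

Coarser steps would mean fewer transitions (and fewer kthread wakeups) at the cost of running slightly above the ideal continuous frequency; the 50 MHz figure is just a plausible middle ground.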
Hi Mike,
On 06/05/15 01:58, Mike Turquette wrote:
[snip]
I'm not sure about jumping to the max frequency when we detect that the signal is saturated.
Ondemand has similar behavior to this and many vendors have implemented out-of-tree solutions that do something like setting the frequency to an "intermediate" rate (maybe 2/3 of the total performance band) and then re-evaluate if they need to jump to max performance after another sampling period.
So at some point you might face the same issue, where vendors find this approach too aggressive and too wasteful of power, thus some intermediate level will be introduced. I'm not providing you any solutions here, but I'm saying that designing a policy algorithm that works well for everyone is super hard.
No doubt about this :).
I got your point, but I guess it should be fairly easy to make this freq at which we jump somewhat "configurable". Makes sense to me, considering the variety of shapes power-perf curves can have, for example.
[snip]
We could also tablify continuous frequency domains based on some reasonable factor like 50 MHz or something. I guess that factor could even be supplied by the driver.
Agree. That's what I was thinking with "discretize continuous systems".
Best,
- Juri
On 6 May 2015 at 04:34, Juri Lelli juri.lelli@arm.com wrote:
[snip]
I got your point, but I guess it should be fairly easy to make this freq at which we jump somewhat "configurable". Makes sense to me, considering the variety of shapes power-perf curves can have, for example.
Instead of another knob, perhaps we could make the 25% headroom flexible by adapting it to current vs past utilization? Not something we need to start off with, but a possible future optimization.
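A minimal sketch of what that adaptation might look like (editorial only; nothing like this is in the posted patches, and the per-cpu history is a made-up variable):

static DEFINE_PER_CPU(unsigned long, prev_util);

/* widen the 25% headroom while utilization is rising */
static unsigned long adaptive_margin_pct(int cpu, unsigned long util)
{
	unsigned long prev = per_cpu(prev_util, cpu);
	unsigned long margin = 125;	/* static default */

	if (prev && util > prev)
		margin = min(150UL,
			     125UL + ((util - prev) * 100UL) / prev);

	per_cpu(prev_util, cpu) = util;
	return margin;
}

The cap keeps a sudden spike from requesting more than 50% headroom, and steady or falling utilization degenerates to the existing static margin.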
[snip]
Agree. That's what I was thinking with "discretize continuous systems".
Sounds possible. Probably not a big deal, but there's a chance of losing out on some power optimization depending on how many discrete steps you make. A matter of system profiling, I guess.
Regards, Ashwin.
On 27 April 2015 at 09:46, Michael Turquette mturquette@linaro.org wrote:
Scheduler-driven cpu frequency selection is desirable as part of the on-going effort to make the scheduler better aware of energy consumption. No piece of the Linux kernel has a better view of the factors that affect a cpu frequency selection policy than the scheduler[0], and this patch is an attempt to get that discussion going again.
This patch implements a cpufreq governor, sched_cfs, that directly accesses scheduler statistics, in particular the pelt data from cfs via the get_cpu_usage() function.
Put plainly, sched_cfs selects the lowest cpu frequency that will prevent a runqueue from being over-utilized (until we hit the highest frequency of course). This is done by requesting a frequency which is equivalent to the current capacity utilization, plus a margin.
Unlike the previous posting from 2014[1] this governor implements a "follow the usage" method, where usage is defined as the cpu frequency-invariant product of utilization_load_avg and cpu_capacity_orig.
This governor is event-driven. There is no polling loop to check cpu idle time, or any other method which is unsynchronized with the scheduler. The entry points for this policy are in fair.c: enqueue_task_fair, dequeue_task_fair and task_tick_fair.
This policy is implemented using the cpufreq governor interface for two main reasons:
- re-using the cpufreq machine drivers without using the governor
interface is hard.
- using the cpufreq interface allows us to switch between the
scheduler-driven policy and legacy cpufreq governors such as ondemand at run-time. This is very useful for comparative testing and tuning.
Finally, it is worth mentioning that this approach neglects all scheduling classes except for cfs. It is possible to add support for deadline and other classes here, but I also wonder if a multi-governor approach would be a more maintainable solution, where the cpufreq core aggregates the constraints set by multiple governors. Supporting such an approach in the cpufreq core would also allow peripheral devices to place constraints on cpu frequency without having to hack such behavior in at the governor level.
Thanks to Juri Lelli juri.lelli@arm.com for doing a good bit of testing, bug fixing and contributing towards the design.
[0] http://article.gmane.org/gmane.linux.kernel/1499836
[1] https://lkml.org/lkml/2014/10/22/22
Signed-off-by: Michael Turquette mturquette@linaro.org
changes since internal v1:
- renamed everything
- fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
- replaced direct usage-to-frequency mapping with usage+margin-to-frequency mapping. This functions like an up_threshold and allows us to easily work with non-discretized frequency ranges
- usage-to-frequency calculation now uses capacity_orig instead of SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
- dropped workqueue method due to instability
- kthread is woken up by irq_work handler. This removes the need for cap_gov_kick_thread() from v1
 drivers/cpufreq/Kconfig          |  24 +++
 include/linux/cpufreq.h          |   3 +
 kernel/sched/Makefile            |   1 +
 kernel/sched/cpufreq_sched_cfs.c | 314 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c              |  11 ++
 kernel/sched/sched.h             |   6 +
 6 files changed, 359 insertions(+)
 create mode 100644 kernel/sched/cpufreq_sched_cfs.c
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index a171fef..35ba9c3 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -102,6 +102,15 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+	bool "sched_cfs"
+	select CPU_FREQ_GOV_SCHED_CFS
+	select CPU_FREQ_GOV_PERFORMANCE
Hi Mike,
do you really need to select CPU_FREQ_GOV_PERFORMANCE ?
+	help
+	  Use the CPUfreq governor 'sched_cfs' as default. This scales
+	  cpu frequency from the scheduler as per-entity load tracking
+	  statistics are updated.
+
 endchoice
 config CPU_FREQ_GOV_PERFORMANCE
@@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE

 	  If in doubt, say N.

+config CPU_FREQ_GOV_SCHED_CFS
+	tristate "'sched cfs' cpufreq governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_COMMON
+	help
+	  'sched_cfs' - this governor scales cpu frequency from the
+	  scheduler as a function of cpu capacity utilization. It does
+	  not evaluate utilization on a periodic basis (as ondemand
+	  does) but instead is invoked from the completely fair
+	  scheduler when updating per-entity load tracking statistics.
+	  Latency to respond to changes in load is improved over polling
+	  governors due to its event-driven design.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 config CPUFREQ_DT
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2ee4888..62e8152 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV)
+extern struct cpufreq_governor cpufreq_gov_cap_gov;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_cap_gov)
 #endif

 /*********************************************************************
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be870..003b592 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o
diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c
new file mode 100644
index 0000000..746b220
--- /dev/null
+++ b/kernel/sched/cpufreq_sched_cfs.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (C) 2015 Michael Turquette mturquette@linaro.org
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+
+#include "sched.h"
+
+#define MARGIN_PCT		125 /* taken from imbalance_pct = 125 */
+#define THROTTLE_NSEC		50000000 /* 50ms default */
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ *
+ * struct gov_data is the per-policy gov_cfs-specific data structure. A
+ * per-policy instance of it is created when the gov_cfs governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+};
+/**
+ * gov_cfs_select_freq - pick the next frequency for a cpu
+ * @policy: the cpufreq policy whose frequency may be changed
+ *
+ * gov_cfs_select_freq selects a frequency based on pelt load statistics
+ * tracked by cfs. First it finds the most utilized cpu in the policy and then
+ * maps that utilization value onto a cpu frequency and returns it.
+ *
+ * Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
+ * before converting it to a frequency. The margin is derived from MARGIN_PCT,
+ * which itself is inspired by imbalance_pct in cfs. This is needed to
+ * proactively increase frequency in the case of increasing load.
+ *
+ * This approach attempts to maintain headroom of 25% unutilized cpu capacity.
+ * A traditional way of doing this is to take 75% of the current capacity and
+ * check if current utilization crosses that threshold. The only problem with
+ * that approach is determining the next cpu frequency target if that threshold
+ * is crossed.
+ *
+ * Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
+ * utilization margin to the utilization and converts that to a frequency. This
+ * removes conditional logic around checking thresholds and better supports
+ * drivers that use non-discretized frequency ranges (i.e. no pre-defined
+ * frequency tables or operating points).
+ *
+ * Returns frequency selected.
+ */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
+{
+	int cpu = 0;
+	struct gov_data *gd;
+	unsigned long freq = 0, max_usage = 0, usage = 0;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/*
+	 * get_cpu_usage is called without locking the runqueues. This is the
+	 * same behavior used by find_busiest_cpu in load_balance. We are
+	 * willing to accept occasionally stale data here in exchange for
+	 * lockless behavior.
+	 */
+	for_each_cpu(cpu, policy->cpus) {
+		usage = get_cpu_usage(cpu);
+		if (usage > max_usage)
+			max_usage = usage;
+	}
+
+	/* add margin to max_usage based on imbalance_pct */
+	max_usage = max_usage * MARGIN_PCT / 100;
+
+	cpu = cpumask_first(policy->cpus);
+
+	/* freq is current utilization + 25% */
+	freq = max_usage * policy->max / capacity_orig_of(cpu);
+
+out:
+	return freq;
+}
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int gov_cfs_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long freq;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		/* avoid race with gov_cfs_stop */
+		if (!down_write_trylock(&policy->rwsem))
+			continue;
+
+		freq = gov_cfs_select_freq(policy);
+
+		ret = __cpufreq_driver_target(policy, freq,
+				CPUFREQ_RELATION_H);
+		if (ret)
+			pr_debug("%s: __cpufreq_driver_target returned %d\n",
+					__func__, ret);
+
+		gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+		up_write(&policy->rwsem);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+
+static void gov_cfs_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd) {
+		return;
+	}
+
+	wake_up_process(gd->task);
+}
+/**
+ * gov_cfs_update_cpu - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ *
+ * gov_cfs_update_cpu is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
+ * the thread that does the actual work, gov_cfs_thread.
+ */
+void gov_cfs_update_cpu(int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+
+	/* XXX put policy pointer in per-cpu data? */
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy)) {
+		return;
+	}
+
+	if (!policy->governor_data) {
+		goto out;
+	}
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle)) {
+		goto out;
+	}
+
+	irq_work_queue_on(&gd->irq_work, cpu);
+
+out:
+	cpufreq_cpu_put(policy);
+	return;
+}
+static void gov_cfs_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		 __func__, gd->throttle_nsec);
+
+	/* init per-policy kthread */
+	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
+	if (IS_ERR_OR_NULL(gd->task))
+		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
+
+	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
+	policy->governor_data = gd;
+	gd->policy = policy;
+}
+
+static void gov_cfs_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	gd = policy->governor_data;
+
+	kthread_stop(gd->task);
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+}
+
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_START:
+		/* Start managing the frequency */
+		gov_cfs_start(policy);
+		return 0;
+
+	case CPUFREQ_GOV_STOP:
+		gov_cfs_stop(policy);
+		return 0;
+
+	case CPUFREQ_GOV_LIMITS:	/* unused */
+	case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+	case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+		break;
+	}
+
+	return 0;
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+static
+#endif
+struct cpufreq_governor cpufreq_gov_cfs = {
+	.name		= "gov_cfs",
+	.governor	= gov_cfs_setup,
+	.owner		= THIS_MODULE,
+};
+
+static int __init gov_cfs_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+
+static void __exit gov_cfs_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+
+/* Try to make this the default governor */
+fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL");
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 393fc36..a7b97f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, rq->nr_running);
 		add_nr_running(rq, 1);
 	}
+
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
+
 	hrtick_update(rq);
 }
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
 	}
+
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
+
 	hrtick_update(rq);
 }
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);

 	update_rq_runnable_avg(rq, 1);
+
+	if(sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
 }

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 63a8be9..ec23523 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);

+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
+void gov_cfs_update_cpu(int cpu);
+#else
+static inline void gov_cfs_update_cpu(int cpu) {}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
--
1.9.1
Quoting Vincent Guittot (2015-04-28 07:10:20)
[snip]
+config CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+	bool "sched_cfs"
+	select CPU_FREQ_GOV_SCHED_CFS
+	select CPU_FREQ_GOV_PERFORMANCE
Hi Mike,
do you really need to select CPU_FREQ_GOV_PERFORMANCE ?
Ondemand and conservative governors do this, so I copied that style. I guess the idea is that a production system should always have the option to run flat out, without requiring a kernel recompile or a kernel module.
Regards, Mike
help
Use the CPUfreq governor 'sched_cfs' as default. This scales
cpu frequency from the scheduler as per-entity load tracking
statistics are updated.
endchoice
config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +192,21 @@ config CPU_FREQ_GOV_CONSERVATIVE
If in doubt, say N.
+config CPU_FREQ_GOV_SCHED_CFS
tristate "'sched cfs' cpufreq governor"
depends on CPU_FREQ
select CPU_FREQ_GOV_COMMON
help
'sched_cfs' - this governor scales cpu frequency from the
scheduler as a function of cpu capacity utilization. It does
not evaluate utilization on a periodic basis (as ondemand
does) but instead is invoked from the completely fair
scheduler when updating per-entity load tracking statistics.
Latency to respond to changes in load is improved over polling
governors due to its event-driven design.
If in doubt, say N.
comment "CPU frequency scaling drivers"
config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..62e8152 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -485,6 +485,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CAP_GOV) +extern struct cpufreq_governor cpufreq_gov_cap_gov; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_cap_gov) #endif
/********************************************************************* diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 46be870..003b592 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED_CFS) += cpufreq_sched_cfs.o diff --git a/kernel/sched/cpufreq_sched_cfs.c b/kernel/sched/cpufreq_sched_cfs.c new file mode 100644 index 0000000..746b220 --- /dev/null +++ b/kernel/sched/cpufreq_sched_cfs.c @@ -0,0 +1,314 @@ +/*
- Copyright (C) 2015 Michael Turquette mturquette@linaro.org
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h> +#include <linux/irq_work.h>
+#include "sched.h"
+#define MARGIN_PCT 125 /* taken from imbalance_pct = 125 */ +#define THROTTLE_NSEC 50000000 /* 50ms default */
+/**
+ * gov_data - per-policy data internal to the governor
+ * @throttle: next throttling period expiry. Derived from throttle_nsec
+ * @throttle_nsec: throttle period length in nanoseconds
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ *
+ * struct gov_data is the per-policy gov_cfs-specific data structure. A
+ * per-policy instance of it is created when the gov_cfs governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+	ktime_t throttle;
+	unsigned int throttle_nsec;
+	struct task_struct *task;
+	struct irq_work irq_work;
+	struct cpufreq_policy *policy;
+};
+/**
+ * gov_cfs_select_freq - pick the next frequency for a cpu
+ * @policy: the cpufreq policy whose frequency may be changed
+ *
+ * gov_cfs_select_freq selects a frequency based on pelt load statistics
+ * tracked by cfs. First it finds the most utilized cpu in the policy and then
+ * maps that utilization value onto a cpu frequency and returns it.
+ *
+ * Additionally, gov_cfs_select_freq adds a margin to the cpu utilization value
+ * before converting it to a frequency. The margin is derived from MARGIN_PCT,
+ * which itself is inspired by imbalance_pct in cfs. This is needed to
+ * proactively increase frequency in the case of increasing load.
+ *
+ * This approach attempts to maintain headroom of 25% unutilized cpu capacity.
+ * A traditional way of doing this is to take 75% of the current capacity and
+ * check if current utilization crosses that threshold. The only problem with
+ * that approach is determining the next cpu frequency target if that
+ * threshold is crossed.
+ *
+ * Instead of using the 75% threshold, gov_cfs_select_freq adds a 25%
+ * utilization margin to the utilization and converts that to a frequency.
+ * This removes conditional logic around checking thresholds and better
+ * supports drivers that use non-discretized frequency ranges (i.e. no
+ * pre-defined frequency tables or operating points).
+ *
+ * Returns frequency selected.
+ */
+static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
+{
+	int cpu = 0;
+	struct gov_data *gd;
+	unsigned long freq = 0, max_usage = 0, usage = 0;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/*
+	 * get_cpu_usage is called without locking the runqueues. This is the
+	 * same behavior used by find_busiest_cpu in load_balance. We are
+	 * willing to accept occasionally stale data here in exchange for
+	 * lockless behavior.
+	 */
+	for_each_cpu(cpu, policy->cpus) {
+		usage = get_cpu_usage(cpu);
+		if (usage > max_usage)
+			max_usage = usage;
+	}
+
+	/* add margin to max_usage based on imbalance_pct */
+	max_usage = max_usage * MARGIN_PCT / 100;
+
+	cpu = cpumask_first(policy->cpus);
+
+	/* freq is current utilization + 25% */
+	freq = max_usage * policy->max / capacity_orig_of(cpu);
+
+out:
+	return freq;
+}
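As a concrete check of the arithmetic above, a minimal worked example (editorial sketch; the capacity and frequency values below are hypothetical, not taken from the patch):

	/*
	 * Worked example of the margin math in gov_cfs_select_freq.
	 * capacity_orig = 1024 and policy->max = 1500000 kHz are made up.
	 */
	unsigned long capacity_orig = 1024;
	unsigned long policy_max = 1500000;	/* kHz */
	unsigned long max_usage = 614;		/* ~60% of capacity_orig */
	unsigned long freq;

	max_usage = max_usage * 125 / 100;	/* MARGIN_PCT: 614 -> 767 */
	freq = max_usage * policy_max / capacity_orig;	/* ~1123535 kHz */

So 60% utilization plus the 25% margin maps to roughly 75% of fmax, which leaves the intended headroom without any explicit threshold check.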
+/*
+ * We pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer.
+ */
+static int gov_cfs_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long freq;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+			 __func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		if (kthread_should_stop())
+			break;
+
+		/* avoid race with gov_cfs_stop */
+		if (!down_write_trylock(&policy->rwsem))
+			continue;
+
+		freq = gov_cfs_select_freq(policy);
+
+		ret = __cpufreq_driver_target(policy, freq,
+					      CPUFREQ_RELATION_H);
+		if (ret)
+			pr_debug("%s: __cpufreq_driver_target returned %d\n",
+				 __func__, ret);
+
+		gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+		up_write(&policy->rwsem);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd)
+		return;
+
+	wake_up_process(gd->task);
+}
+/**
+ * gov_cfs_update_cpu - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ *
+ * gov_cfs_update_cpu is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes
+ * up the thread that does the actual work, gov_cfs_thread.
+ */
+void gov_cfs_update_cpu(int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+
+	/* XXX put policy pointer in per-cpu data? */
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy))
+		return;
+
+	if (!policy->governor_data)
+		goto out;
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle))
+		goto out;
+
+	irq_work_queue_on(&gd->irq_work, cpu);
+
+out:
+	cpufreq_cpu_put(policy);
+}
+static void gov_cfs_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		 __func__, gd->throttle_nsec);
+
+	/* init per-policy kthread */
+	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
+	if (IS_ERR_OR_NULL(gd->task))
+		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
+
+	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
+
+	policy->governor_data = gd;
+	gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	gd = policy->governor_data;
+	kthread_stop(gd->task);
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_START:
+		/* Start managing the frequency */
+		gov_cfs_start(policy);
+		return 0;
+
+	case CPUFREQ_GOV_STOP:
+		gov_cfs_stop(policy);
+		return 0;
+
+	case CPUFREQ_GOV_LIMITS:	/* unused */
+	case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+	case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+		break;
+	}
+	return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+static
+#endif
+struct cpufreq_governor cpufreq_gov_cfs = {
+	.name		= "gov_cfs",
+	.governor	= gov_cfs_setup,
+	.owner		= THIS_MODULE,
+};
+static int __init gov_cfs_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+
+static void __exit gov_cfs_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+
+/* Try to make this the default governor */
+fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL"); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 393fc36..a7b97f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_rq_runnable_avg(rq, rq->nr_running); add_nr_running(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); }
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
hrtick_update(rq);
}
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
if(sched_energy_freq())
gov_cfs_update_cpu(cpu_of(rq));
}
 /*

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 63a8be9..ec23523 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);

+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
+void gov_cfs_update_cpu(int cpu);
+#else
+static inline void gov_cfs_update_cpu(int cpu) {}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
--
1.9.1
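For reference, the governor can be selected at run time like any other cpufreq governor. A sketch, assuming the standard cpufreq sysfs layout, and noting that the governor registers under the .name "gov_cfs" rather than the "sched_cfs" used in the Kconfig text:

	# cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_governors
	# echo gov_cfs > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor

This run-time switching is what enables the comparative testing against ondemand mentioned in the commit message.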
On 04/27/2015 09:46 AM, Michael Turquette wrote:
> Scheduler-driven cpu frequency selection is desirable as part of the
> on-going effort to make the scheduler better aware of energy
> consumption. No piece of the Linux kernel has a better view of the
> factors that affect a cpu frequency selection policy than the
> scheduler[0], and this patch is an attempt to get that discussion going
> again.
>
> This patch implements a cpufreq governor, sched_cfs, that directly
> accesses scheduler statistics, in particular the pelt data from cfs via
> the get_cpu_usage() function.
>
> Put plainly, sched_cfs selects the lowest cpu frequency that will
> prevent a runqueue from being over-utilized (until we hit the highest
> frequency of course). This is done by requesting a frequency which is
> equivalent to the current capacity utilization, plus a margin.
>
> Unlike the previous posting from 2014[1] this governor implements a
> "follow the usage" method, where usage is defined as the cpu
> frequency-invariant product of utilization_load_avg and
> cpu_capacity_orig.
>
> This governor is event-driven. There is no polling loop to check cpu
> idle time, or any other method which is unsynchronized with the
> scheduler. The entry points for this policy are in fair.c:
> enqueue_task_fair, dequeue_task_fair and task_tick_fair.
>
> This policy is implemented using the cpufreq governor interface for two
> main reasons:
>
> - re-using the cpufreq machine drivers without using the governor
>   interface is hard.
>
> - using the cpufreq interface allows us to switch between the
>   scheduler-driven policy and legacy cpufreq governors such as ondemand
>   at run-time. This is very useful for comparative testing and tuning.
>
> Finally, it is worth mentioning that this approach neglects all
> scheduling classes except for cfs. It is possible to add support for
> deadline and other classes here, but I also wonder if a multi-governor
> approach would be a more maintainable solution, where the cpufreq core
> aggregates the constraints set by multiple governors. Supporting such an
> approach in the cpufreq core would also allow for peripheral devices to
> place constraints on cpu frequency without having to hack such behavior
> in at the governor level.
>
> Thanks to Juri Lelli <juri.lelli@arm.com> for doing a good bit of
> testing, bug fixing and contributing towards the design.
>
> [0] http://article.gmane.org/gmane.linux.kernel/1499836
> [1] https://lkml.org/lkml/2014/10/22/22
>
> Signed-off-by: Michael Turquette <mturquette@linaro.org>
>
> changes since internal v1:
> * renamed everything
> * fixed possible deadlock between gov_cfs_thread and gov_cfs_stop
> * replaced direct usage-to-frequency mapping with
>   usage+margin-to-frequency mapping. This functions like an up_threshold
>   and allows us to easily work with non-discretized frequency ranges
> * usage-to-frequency calculation now uses capacity_orig instead of
>   SCHED_LOAD_SCALE to handle SMT and asymmetric cpu use cases
> * dropped workqueue method due to instability
> * kthread is woken up by irq_work handler. This removes the need for
>   cap_gov_kick_thread() from v1
[snip]
> +static unsigned long gov_cfs_select_freq(struct cpufreq_policy *policy)
> +{
> +	int cpu = 0;
> +	struct gov_data *gd;
> +	unsigned long freq = 0, max_usage = 0, usage = 0;
> +
> +	if (!policy->governor_data)
> +		goto out;
s/goto out/return 0/
> +	gd = policy->governor_data;
> +
> +	/*
> +	 * get_cpu_usage is called without locking the runqueues. This is the
> +	 * same behavior used by find_busiest_cpu in load_balance. We are
> +	 * willing to accept occasionally stale data here in exchange for
> +	 * lockless behavior.
> +	 */
> +	for_each_cpu(cpu, policy->cpus) {
> +		usage = get_cpu_usage(cpu);
> +		if (usage > max_usage)
> +			max_usage = usage;
> +	}
> +
> +	/* add margin to max_usage based on imbalance_pct */
> +	max_usage = max_usage * MARGIN_PCT / 100;
> +
> +	cpu = cpumask_first(policy->cpus);
> +
> +	/* freq is current utilization + 25% */
> +	freq = max_usage * policy->max / capacity_orig_of(cpu);
Couldn't this be slightly simplified by using directly cpu_rq(cpu)->cfs.utilization_load_avg instead of calling get_cpu_usage ?
> +out:
> +	return freq;
> +}

[snip]

> +		freq = gov_cfs_select_freq(policy);
> +
> +		ret = __cpufreq_driver_target(policy, freq,
> +					      CPUFREQ_RELATION_H);
Shouldn't the relation be H or L depending on whether we are increasing or decreasing the freq ?
[snip]

> +	/* init per-policy kthread */
> +	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
> +	if (IS_ERR_OR_NULL(gd->task))
> +		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
> +
> +	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
It does not make sense to have a workqueue and a kthread; this is duplicating what the workqueue already does.

I also saw the irq_work + kthread mail you sent and I believe it is the way to go. Did you think about creating a workqueue per clock line instead of using the irq_work ?
Quoting Daniel Lezcano (2015-04-30 08:05:52)
> On 04/27/2015 09:46 AM, Michael Turquette wrote:
[snip]
> > +	if (!policy->governor_data)
> > +		goto out;
>
> s/goto out/return 0/
OK.
[snip]

> > +	/* freq is current utilization + 25% */
> > +	freq = max_usage * policy->max / capacity_orig_of(cpu);
>
> Couldn't this be slightly simplified by using directly
> cpu_rq(cpu)->cfs.utilization_load_avg instead of calling get_cpu_usage ?
The big.LITTLE case here is confusing. Is cfs.utilization_load_avg already normalized against cpu capacity differences? If so then you are right, I could use the value directly. But if not, then get_cpu_usage buys us that normalization by doing:

	cfs.utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT;

where capacity_orig may be different across various CPUs.
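To make that normalization concrete, a small editorial sketch (the capacity values are hypothetical, e.g. a LITTLE and a big cluster, and it assumes the default SCHED_LOAD_SHIFT of 10):

	/* usage = utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT */
	static unsigned long usage_of(unsigned long util, unsigned long cap_orig)
	{
		return util * cap_orig >> 10;	/* SCHED_LOAD_SHIFT == 10 */
	}

	/*
	 * With util = 512 on both cpus:
	 *   little cpu, cap_orig = 430:  usage = 215
	 *   big cpu,    cap_orig = 1024: usage = 512
	 * The same raw utilization represents less absolute compute capacity
	 * on the little cpu, which is what the scaling captures.
	 */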
[snip]

> > +		ret = __cpufreq_driver_target(policy, freq,
> > +					      CPUFREQ_RELATION_H);
>
> Shouldn't the relation be H or L depending on whether we are increasing
> or decreasing the freq ?
Yes, this was pointed out in another reply. I'll fix it up.
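For the archive, one way the fix could look in gov_cfs_thread's main loop (an editorial sketch of the idea discussed above, not the actual follow-up patch; it assumes policy->cur holds the currently programmed frequency):

	unsigned int relation;

	freq = gov_cfs_select_freq(policy);

	/* round up when raising the frequency, round down when lowering it */
	relation = (freq > policy->cur) ? CPUFREQ_RELATION_L :
					  CPUFREQ_RELATION_H;

	ret = __cpufreq_driver_target(policy, freq, relation);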
[snip]

> > +	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
>
> It does not make sense to have a workqueue and a kthread; this is
> duplicating what the workqueue already does.
There is no traditional wq here. Just irq_work handler + kthread.
> I also saw the irq_work + kthread mail you sent and I believe it is the
> way to go. Did you think about creating a workqueue per clock line
> instead of using the irq_work ?
I am confused by how you phrased the above text. This patch uses irq_work + kthread, which you say is the way to go. But I don't use a traditional workqueue instead of irq_work.

I did implement a method that uses irq_work + wq, and it had some bugs, as well as the fact that it clearly increased cfs load tracking stats.

Thanks a lot for the review,
Mike
On 01/05/15 00:49, Michael Turquette wrote:
> Quoting Daniel Lezcano (2015-04-30 08:05:52)
> > On 04/27/2015 09:46 AM, Michael Turquette wrote:
[snip]
> > > +	/*
> > > +	 * get_cpu_usage is called without locking the runqueues. This is the
> > > +	 * same behavior used by find_busiest_cpu in load_balance. We are
> > > +	 * willing to accept occasionally stale data here in exchange for
> > > +	 * lockless behavior.
> > > +	 */
> > > +	for_each_cpu(cpu, policy->cpus) {
> > > +		usage = get_cpu_usage(cpu);
> > > +		if (usage > max_usage)
> > > +			max_usage = usage;
> > > +	}
> > > +
> > > +	/* add margin to max_usage based on imbalance_pct */
> > > +	max_usage = max_usage * MARGIN_PCT / 100;
> > > +
> > > +	cpu = cpumask_first(policy->cpus);
> > > +
> > > +	/* freq is current utilization + 25% */
> > > +	freq = max_usage * policy->max / capacity_orig_of(cpu);
> >
> > Couldn't this be slightly simplified by using directly
> > cpu_rq(cpu)->cfs.utilization_load_avg instead of calling get_cpu_usage ?
>
> The big.LITTLE case here is confusing. Is cfs.utilization_load_avg
> already normalized against cpu capacity differences? If so then you are
> right, I
Nope. In this patchset utilization_load_avg is only freq invariant.
Thanks,
- Juri
> could use the value directly. But if not, then get_cpu_usage buys us
> that normalization by doing:
>
> 	cfs.utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT;
>
> where capacity_orig may be different across various CPUs.
On 05/01/2015 01:49 AM, Michael Turquette wrote:
[ ... ]
> > > +	/* init per-policy kthread */
> > > +	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
> > > +	if (IS_ERR_OR_NULL(gd->task))
> > > +		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
> > > +
> > > +	init_irq_work(&gd->irq_work, gov_cfs_irq_work);
> >
> > It does not make sense to have a workqueue and a kthread; this is
> > duplicating what the workqueue already does.
>
> There is no traditional wq here. Just irq_work handler + kthread.
>
> > I also saw the irq_work + kthread mail you sent and I believe it is
> > the way to go. Did you think about creating a workqueue per clock line
> > instead of using the irq_work ?
>
> I am confused by how you phrased the above text. This patch uses
> irq_work + kthread, which you say is the way to go. But I don't use a
> traditional workqueue instead of irq_work.
Sorry for the confusion. I was referring to the email:

"[Eas-dev] [PATCH] cap_gov: irq_work + workqueue".
> I did implement a method that uses irq_work + wq, and it had some bugs,

Why are you using the irq_work ?

> as well as the fact that it clearly increased cfs load tracking stats.

Ah, interesting. Could you elaborate ?
Thanks
-- Daniel
Quoting Daniel Lezcano (2015-05-04 01:05:04)
> On 05/01/2015 01:49 AM, Michael Turquette wrote:
>
> [ ... ]
>
> > There is no traditional wq here. Just irq_work handler + kthread.
>
> Sorry for the confusion. I was referring to the email:
>
> "[Eas-dev] [PATCH] cap_gov: irq_work + workqueue".
>
> > I did implement a method that uses irq_work + wq, and it had some bugs,
>
> Why are you using the irq_work ?
Good question. The bulk of the Real Work is done in the kthread. We need to wake up the kthread somehow from inside enqueue_task_fair, dequeue_task_fair and task_tick_fair. These functions hold runqueue locks and disable interrupts. We cannot call any function that might sleep or call schedule().
In order to wake the kthread we use wake_up_process(). The good news is that this function does not sleep. The bad news is that calling it will re-enter the scheduler, which is fatal.
Thus we need a way to call wake_up_process AFTER we exit the critical section in the scheduler where irqs are disabled. One of the ways we handled this in previous patch sets was to hack in a callback in run_rebalance_domains, but this has two problems:
1) it is an ugly hack
2) waking up the kthread there causes undesirable periodic behavior
Juri proposed a solution to register an irq_work callback that simply calls wake_up_process (which is safe since wake_up_process will not sleep). From within the scheduler we raise an IPI. After we exit the critical section and re-enable interrupts then we handle the IPI which wakes up the kthread.
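As a rough sketch of that flow (gov_cfs_irq_work, gd->task and gd->irq_work appear in the quoted code above; the gov_cfs_kick helper and the exact struct layout are illustrative assumptions, not the patch's actual code):

#include <linux/irq_work.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct gov_data {
	struct irq_work irq_work;
	struct task_struct *task;
	/* ... other per-policy state elided ... */
};

/* irq_work handler: runs once irqs are re-enabled, outside the rq lock,
 * so waking the kthread here cannot re-enter the scheduler fatally. */
static void gov_cfs_irq_work(struct irq_work *irq_work)
{
	struct gov_data *gd = container_of(irq_work, struct gov_data, irq_work);

	wake_up_process(gd->task);
}

/* Called from the enqueue/dequeue/tick paths with the rq lock held and
 * irqs disabled: only raise the self-IPI here, never wake_up_process(). */
static inline void gov_cfs_kick(struct gov_data *gd)
{
	irq_work_queue(&gd->irq_work);
}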
The ideal solution would be to wake up the kthread from within the scheduler's critical section via some special case which does not cause reentry. This can be done but it is a bit over my head and might not be accepted upstream.
Regards, Mike
as well as the fact that it clearly increased cfs load tracking stats.
Ah, interesting. Could you elaborate?
Thanks
-- Daniel
Quoting Daniel Lezcano (2015-05-04 01:05:04)
On 05/01/2015 01:49 AM, Michael Turquette wrote:
[ ... ]
I did implement a method that uses irqwork + wq, and it had some bugs,
as well as the fact that it clearly increased cfs load tracking stats.
Ah, interesting. Could you elaborate?
Oops. Forgot to answer this in my previous mail.
The kthread is currently set to use SCHED_FIFO, making it an RT task. The main reason to do this is that it receives higher priority in the runqueue and will run BEFORE the cfs tasks.
There is also the nice side effect that the cfs governor only looks at cfs load right now. Thus the added load of doing a DVFS transition as an rt task doesn't affect the cfs load statistics, and we kind of get this behavior "for free". In other words we get to avoid the observer effect ;-)
Of course some day if we want to start basing a dvfs decision on rt tasks stats then we will lose this behavior.
To answer your question, using a workqueue puts SCHED_OTHER tasks onto the cfs runqueues. Thus we DO see an impact on cfs load stats by doing a dvfs transition in this way.
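For illustration, a minimal sketch of how the kthread could be given RT priority at init time, slotted into gov_cfs_start next to the kthread_run call quoted above; the priority value (50) is an assumption, not taken from the patch:

#include <linux/kthread.h>
#include <linux/sched.h>

	/* Sketch: run the per-policy kthread as SCHED_FIFO so the DVFS
	 * work it performs is charged to the rt class, not to the cfs
	 * load statistics the governor samples. Priority 50 is an
	 * illustrative value only. */
	struct sched_param param = { .sched_priority = 50 };

	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
	if (!IS_ERR_OR_NULL(gd->task))
		sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);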
Regards, Mike