Hi,
If slow path frequency changes are conducted in a SCHED_OTHER context, they may be delayed for some amount of time, including indefinitely, when real-time or deadline activity is taking place.
Move the slow path to a real-time kernel thread using the kthread-worker infrastructure. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now.
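For readers skimming the cover letter, here is a condensed sketch of the mechanism (the real code is in patch 3/4 below; the helper name and exact arguments here are made up for illustration):

#include <linux/kthread.h>
#include <linux/sched.h>

/*
 * Sketch only: slow-path frequency work is queued to a kthread worker
 * whose task runs as SCHED_FIFO, so RT/DL activity can no longer starve
 * it the way it can starve a SCHED_OTHER workqueue item.
 */
static int sugov_rt_worker_setup_sketch(struct kthread_worker *worker,
					struct task_struct **thread_out,
					int cpu)
{
	struct sched_param param = { .sched_priority = 50 };
	struct task_struct *thread;

	kthread_init_worker(worker);
	thread = kthread_create(kthread_worker_fn, worker, "sugov:%d", cpu);
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* RT for now; SCHED_DEADLINE is left as future work, as noted above. */
	sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);

	*thread_out = thread;
	wake_up_process(thread);
	return 0;
}

Work items queued to that worker with kthread_queue_work() then run in the RT thread instead of in a workqueue.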
This was tested with Hackbench on an ARM Exynos dual-core A15 platform and no regressions were seen. The third patch has more details on it.
This work was started by Steve Muckle, who used a simple kthread instead of a kthread worker; that wasn't sufficient as some guarantees weren't met.
I was wondering if the same should be done for the ondemand/conservative governors as well?
V1->V2:
- first patch is new, based on Peter's suggestions
- fixed indented labels
- moved kthread creation/destruction into separate routines
- used a macro instead of the magic number '50'
- minor formatting, commenting and improved commit logs
-- viresh
Viresh Kumar (4):
  cpufreq: schedutil: Avoid indented labels
  cpufreq: schedutil: enable fast switch earlier
  cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task
  cpufreq: schedutil: irq-work and mutex are only used in slow path
 kernel/sched/cpufreq_schedutil.c | 119 ++++++++++++++++++++++++++++++++-------
 1 file changed, 99 insertions(+), 20 deletions(-)
Switch to the more common practice of writing labels.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e06898997d..8c4e1652e895 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -454,17 +454,17 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto fail;
 
- out:
+out:
 	mutex_unlock(&global_tunables_lock);
 
 	cpufreq_enable_fast_switch(policy);
 	return 0;
 
- fail:
+fail:
 	policy->governor_data = NULL;
 	sugov_tunables_free(tunables);
 
- free_sg_policy:
+free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
The fast_switch_enabled flag will be used by both sugov_policy_alloc() and sugov_policy_free() with a later patch.
Prepare for that by moving the calls to enable and disable it to the beginning of sugov_init() and end of sugov_exit().
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 8c4e1652e895..68f21bb6bd44 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -416,9 +416,13 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (policy->governor_data)
 		return -EBUSY;
 
+	cpufreq_enable_fast_switch(policy);
+
 	sg_policy = sugov_policy_alloc(policy);
-	if (!sg_policy)
-		return -ENOMEM;
+	if (!sg_policy) {
+		ret = -ENOMEM;
+		goto disable_fast_switch;
+	}
 
 	mutex_lock(&global_tunables_lock);
 
@@ -456,8 +460,6 @@ static int sugov_init(struct cpufreq_policy *policy)
 
 out:
 	mutex_unlock(&global_tunables_lock);
-
-	cpufreq_enable_fast_switch(policy);
 	return 0;
 
 fail:
@@ -468,6 +470,10 @@ static int sugov_init(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+	cpufreq_disable_fast_switch(policy);
+	pr_err("initialization failed (error %d)\n", ret);
 	return ret;
 }
 
@@ -478,8 +484,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	struct sugov_tunables *tunables = sg_policy->tunables;
 	unsigned int count;
 
-	cpufreq_disable_fast_switch(policy);
-
 	mutex_lock(&global_tunables_lock);
 
 	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -490,6 +494,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
+	cpufreq_disable_fast_switch(policy);
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
If slow path frequency changes are conducted in a SCHED_OTHER context, they may be delayed for some amount of time, including indefinitely, when real-time or deadline activity is taking place.
Move the slow path to a real-time kernel thread. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now.
Hackbench results on an ARM Exynos dual-core A15 platform, for 10 iterations:
$ hackbench -s 100 -l 100 -g 10 -f 20
Before		After
---------------------------------
1.808		1.603
1.847		1.251
2.229		1.590
1.952		1.600
1.947		1.257
1.925		1.627
2.694		1.620
1.258		1.621
1.919		1.632
1.250		1.240

Average:

1.8829		1.5041
Based on initial work by Steve Muckle.
Signed-off-by: Steve Muckle <smuckle.linux@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/cpufreq_schedutil.c | 85 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 78 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 68f21bb6bd44..f165ba0f0766 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,11 +12,14 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/cpufreq.h>
+#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <trace/events/power.h>
 
 #include "sched.h"
 
+#define SUGOV_KTHREAD_PRIORITY	50
+
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
 	unsigned int rate_limit_us;
@@ -35,8 +38,10 @@ struct sugov_policy {
 
 	/* The next fields are only needed if fast switch cannot be used. */
 	struct irq_work irq_work;
-	struct work_struct work;
+	struct kthread_work work;
 	struct mutex work_lock;
+	struct kthread_worker worker;
+	struct task_struct *thread;
 	bool work_in_progress;
 
 	bool need_freq_update;
@@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	raw_spin_unlock(&sg_policy->update_lock);
 }
 
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
 {
 	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
 
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
 	struct sugov_policy *sg_policy;
 
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
-	schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+	/*
+	 * For Real Time and Deadline tasks, schedutil governor shoots the
+	 * frequency to maximum. And special care must be taken to ensure that
+	 * this kthread doesn't result in that.
+	 *
+	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+	 * updated only at the end of the sugov_work() and before that schedutil
+	 * rejects all other frequency scaling requests.
+	 *
+	 * Though there is a very rare case where the RT thread yields right
+	 * after the work_in_progress flag is cleared. The effects of that are
+	 * neglected for now.
+	 */
+	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 }
 
 /************************** sysfs interface ************************/
@@ -372,7 +391,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 
 	sg_policy->policy = policy;
 	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
-	INIT_WORK(&sg_policy->work, sugov_work);
 	mutex_init(&sg_policy->work_lock);
 	raw_spin_lock_init(&sg_policy->update_lock);
 	return sg_policy;
@@ -384,6 +402,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
 	kfree(sg_policy);
 }
 
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+	struct task_struct *thread;
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct cpufreq_policy *policy = sg_policy->policy;
+	int ret;
+
+	/* kthread only required for slow path */
+	if (policy->fast_switch_enabled)
+		return 0;
+
+	kthread_init_work(&sg_policy->work, sugov_work);
+	kthread_init_worker(&sg_policy->worker);
+	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+				"sugov:%d",
+				cpumask_first(policy->related_cpus));
+	if (IS_ERR(thread)) {
+		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+		return PTR_ERR(thread);
+	}
+
+	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+	if (ret) {
+		kthread_stop(thread);
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		return ret;
+	}
+
+	sg_policy->thread = thread;
+	kthread_bind_mask(thread, policy->related_cpus);
+	wake_up_process(thread);
+
+	return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+	/* kthread only required for slow path */
+	if (sg_policy->policy->fast_switch_enabled)
+		return;
+
+	kthread_flush_worker(&sg_policy->worker);
+	kthread_stop(sg_policy->thread);
+}
+
 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
 {
 	struct sugov_tunables *tunables;
@@ -424,12 +487,16 @@ static int sugov_init(struct cpufreq_policy *policy)
 		goto disable_fast_switch;
 	}
 
+	ret = sugov_kthread_create(sg_policy);
+	if (ret)
+		goto free_sg_policy;
+
 	mutex_lock(&global_tunables_lock);
 
 	if (global_tunables) {
 		if (WARN_ON(have_governor_per_policy())) {
 			ret = -EINVAL;
-			goto free_sg_policy;
+			goto stop_kthread;
 		}
 		policy->governor_data = sg_policy;
 		sg_policy->tunables = global_tunables;
@@ -441,7 +508,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	tunables = sugov_tunables_alloc(sg_policy);
 	if (!tunables) {
 		ret = -ENOMEM;
-		goto free_sg_policy;
+		goto stop_kthread;
 	}
 
 	tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -466,6 +533,9 @@ static int sugov_init(struct cpufreq_policy *policy)
 	policy->governor_data = NULL;
 	sugov_tunables_free(tunables);
 
+stop_kthread:
+	sugov_kthread_stop(sg_policy);
+
 free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 
@@ -493,6 +563,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
 
 	mutex_unlock(&global_tunables_lock);
 
+	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
 	cpufreq_disable_fast_switch(policy);
 }
@@ -541,7 +612,7 @@ static void sugov_stop(struct cpufreq_policy *policy)
 
 	synchronize_sched();
 
 	irq_work_sync(&sg_policy->irq_work);
-	cancel_work_sync(&sg_policy->work);
+	kthread_cancel_work_sync(&sg_policy->work);
 }
 
 static void sugov_limits(struct cpufreq_policy *policy)
On Tue, Nov 15, 2016 at 01:53:22PM +0530, Viresh Kumar wrote:
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

/*
* For Real Time and Deadline tasks, schedutil governor shoots the
* frequency to maximum. And special care must be taken to ensure that
* this kthread doesn't result in that.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() and before that schedutil
* rejects all other frequency scaling requests.
*
* Though there is a very rare case where the RT thread yields right
* after the work_in_progress flag is cleared. The effects of that are
* neglected for now.
*/
kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
Right, so that's a wee bit icky, but it's also entirely pre-existing code.
Acked-by: Peter Zijlstra (Intel) peterz@infradead.org
On Wednesday, November 16, 2016 04:26:05 PM Peter Zijlstra wrote:
On Tue, Nov 15, 2016 at 01:53:22PM +0530, Viresh Kumar wrote:
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

/*
* For Real Time and Deadline tasks, schedutil governor shoots the
* frequency to maximum. And special care must be taken to ensure that
* this kthread doesn't result in that.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() and before that schedutil
* rejects all other frequency scaling requests.
*
* Though there is a very rare case where the RT thread yields right
* after the work_in_progress flag is cleared. The effects of that are
* neglected for now.
*/
kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
Right, so that's a wee bit icky, but it's also entirely pre-existing code.
Acked-by: Peter Zijlstra (Intel) peterz@infradead.org
Whole series applied.
Thanks, Rafael
* Viresh Kumar viresh.kumar@linaro.org wrote:
/*
* For Real Time and Deadline tasks, schedutil governor shoots the
* frequency to maximum. And special care must be taken to ensure that
* this kthread doesn't result in that.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() and before that schedutil
* rejects all other frequency scaling requests.
*
* Though there is a very rare case where the RT thread yields right
* after the work_in_progress flag is cleared. The effects of that are
* neglected for now.
*/
s/schedutil governor/ the schedutil governor
s/And special care must be taken/ Special care must be taken
s/at the end of the sugov_work()/ at the end of the sugov_work() function
s/before that schedutil rejects/ before the schedutil governor rejects
s/Though there is a very rare case where/There is a very rare case though, where
Thanks,
Ingo
On 24-11-16, 05:51, Ingo Molnar wrote:
* Viresh Kumar viresh.kumar@linaro.org wrote:
/*
* For Real Time and Deadline tasks, schedutil governor shoots the
* frequency to maximum. And special care must be taken to ensure that
* this kthread doesn't result in that.
*
* This is (mostly) guaranteed by the work_in_progress flag. The flag is
* updated only at the end of the sugov_work() and before that schedutil
* rejects all other frequency scaling requests.
*
* Though there is a very rare case where the RT thread yields right
* after the work_in_progress flag is cleared. The effects of that are
* neglected for now.
*/
s/schedutil governor/ the schedutil governor
s/And special care must be taken/ Special care must be taken
s/at the end of the sugov_work()/ at the end of the sugov_work() function
s/before that schedutil rejects/ before the schedutil governor rejects
s/Though there is a very rare case where/There is a very rare case though, where
Thanks. I will send a fix for this.
Execute the irq-work specific initialization/exit code only when the fast path isn't available.
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/cpufreq_schedutil.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f165ba0f0766..42a220e78f00 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -390,15 +390,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 		return NULL;
 
 	sg_policy->policy = policy;
-	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
-	mutex_init(&sg_policy->work_lock);
 	raw_spin_lock_init(&sg_policy->update_lock);
 	return sg_policy;
 }
 
 static void sugov_policy_free(struct sugov_policy *sg_policy)
 {
-	mutex_destroy(&sg_policy->work_lock);
 	kfree(sg_policy);
 }
 
@@ -432,6 +429,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 
 	sg_policy->thread = thread;
 	kthread_bind_mask(thread, policy->related_cpus);
+	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+	mutex_init(&sg_policy->work_lock);
+
 	wake_up_process(thread);
 
 	return 0;
@@ -445,6 +445,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy)
 
 	kthread_flush_worker(&sg_policy->worker);
 	kthread_stop(sg_policy->thread);
+	mutex_destroy(&sg_policy->work_lock);
 }
 
 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
@@ -611,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
 
 	synchronize_sched();
 
-	irq_work_sync(&sg_policy->irq_work);
-	kthread_cancel_work_sync(&sg_policy->work);
+	if (!policy->fast_switch_enabled) {
+		irq_work_sync(&sg_policy->irq_work);
+		kthread_cancel_work_sync(&sg_policy->work);
+	}
 }
 
 static void sugov_limits(struct cpufreq_policy *policy)
Firstly, please start changes to scheduler code with a verb. This title:
Subject: Re: [PATCH V2 4/4] cpufreq: schedutil: irq-work and mutex are only used in slow path
is totally inadequate as it's a statement that says nothing about the _change_.
What does the patch do? Does it add, remove, modify, fix or clean up?
* Viresh Kumar viresh.kumar@linaro.org wrote:
Execute the irq-work specific initialization/exit code only when the fast path isn't available.
Is this an optimization? A correctness fix?
Thanks,
Ingo
On 24-11-16, 05:53, Ingo Molnar wrote:
Firstly, please start changes to scheduler code with a verb. This title:
Subject: Re: [PATCH V2 4/4] cpufreq: schedutil: irq-work and mutex are only used in slow path
is totally inadequate as it's a statement that says nothing about the _change_.
What does the patch do? Does it add, remove, modify, fix or clean up?
Thanks for the tip. I have sometimes seen similar subject lines in patches from core developers and so thought it might be right. But yes, I understand what you are saying and will take care of this in the future across all subsystems.
* Viresh Kumar viresh.kumar@linaro.org wrote:
Execute the irq-work specific initialization/exit code only when the fast path isn't available.
Is this an optimization? A correctness fix?
It's an optimization, but yeah, I will try to explain a bit more next time.
* Viresh Kumar viresh.kumar@linaro.org wrote:
* Viresh Kumar viresh.kumar@linaro.org wrote:
Execute the irq-work specific initialization/exit code only when the fast path isn't available.
Is this an optimization? A correctness fix?
It's an optimization, but yeah, I will try to explain a bit more next time.
Thanks!
Ingo
Hi Viresh,
On Tue, Nov 15, 2016 at 01:53:19PM +0530, Viresh Kumar wrote:
This work was started by Steve Muckle, where he used a simple kthread instead of kthread-worker and that wasn't sufficient as some guarantees weren't met.
I know this has already gone in, but can you expand on the unmet guarantees mentioned here just for my own (and perhaps others') understanding?
thanks, Steve
On 23-11-16, 19:01, Steve Muckle wrote:
I know this has already gone in, but can you expand on the unmet guarantees mentioned here just for my own (and perhaps others') understanding?
Sure. This is the simplified form of your original patch:
@@ -71,7 +73,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 {
 	s64 delta_ns;
 
-	if (sg_policy->work_in_progress)
+	if (sg_policy->thread_active)
 		return false;
 
 	if (unlikely(sg_policy->need_freq_update)) {

+static int sugov_thread(void *data)
+{
...
+	do {
...
+		set_current_state(TASK_RUNNING);
+		/* Issue request. */
+		mutex_lock(&sg_policy->slow_lock);
+		__cpufreq_driver_target(sg_policy->policy,
+					sg_policy->next_freq,
+					CPUFREQ_RELATION_L);
+		mutex_unlock(&sg_policy->slow_lock);
+
+		sg_policy->thread_active = false;
...
+		schedule();
+
+	} while (!kthread_should_stop());
+	return 0;
+}

 static void sugov_irq_work(struct irq_work *irq_work)
@@ -349,7 +382,7 @@ static void sugov_irq_work(struct irq_work *irq_work)
 	struct sugov_policy *sg_policy;
 
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
-	schedule_work_on(smp_processor_id(), &sg_policy->work);
+	wake_up_process(sg_policy->thread);
 }
Consider that the thread has just got a chance to run and has set 'thread_active' to false in sugov_thread(), i.e. schedule() hasn't been called yet.
If sugov_should_update_freq() gets called again at this time, it will return 'true', and eventually we will land in sugov_irq_work(), which will call wake_up_process(). But the process is already running and hasn't slept yet. I am not sure exactly how that plays out, but I don't expect the thread to go to sleep again at this point.
And in this particular case we will end up not evaluating the load and doing DVFS for a period of 2 * rate_limit_us, whereas we wanted to do that every rate_limit_us.
Of course, a simple kthread would have been better than a kthread + kthread-work if this weren't the case.
Does that sound reasonable, or am I missing something?
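For completeness: the reason the kthread-worker variant doesn't lose the request in that window is that the worker loop re-checks its queue before sleeping. Below is a simplified, paraphrased sketch of that loop (modelled on kthread_worker_fn() from kernel/kthread.c, not a verbatim copy, with locking details trimmed):

static int kthread_worker_fn_sketch(void *worker_ptr)
{
	struct kthread_worker *worker = worker_ptr;
	struct kthread_work *work;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		work = NULL;
		spin_lock_irq(&worker->lock);
		if (!list_empty(&worker->work_list)) {
			work = list_first_entry(&worker->work_list,
						struct kthread_work, node);
			list_del_init(&work->node);
		}
		spin_unlock_irq(&worker->lock);

		if (work) {
			__set_current_state(TASK_RUNNING);
			work->func(work);	/* sugov_work() in our case */
		} else {
			/* Sleep only if nothing was queued meanwhile. */
			schedule();
		}
	}

	return 0;
}

So if sugov_irq_work() queues the work right after work_in_progress is cleared but before the worker sleeps, the item is still sitting on worker->work_list and gets executed on the next loop iteration, instead of being missed until the next rate_limit_us window as with a bare wake_up_process().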