On Tue, 2014-03-18 at 11:29 +0000, Chris Redpath wrote:
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If we trigger for a nohz kick, it doesn't matter about triggering for an idle pull since the idle_pull flag will be set when we execute the softirq and we'll still do the idle pull.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
kernel/sched/core.c | 12 +++++- kernel/sched/fair.c | 100 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 1 + 3 files changed, 101 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de9d360..2a74474 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1407,7 +1407,11 @@ void scheduler_ipi(void) { if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id())
&& !got_nohz_idle_kick())
&& !got_nohz_idle_kick()
+#ifdef CONFIG_SCHED_HMP
&& !this_rq()->wake_for_idle_pull
+#endif
return;)
/* @@ -1434,6 +1438,12 @@ void scheduler_ipi(void) this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#ifdef CONFIG_SCHED_HMP
- else if (unlikely(this_rq()->wake_for_idle_pull)) {
raise_softirq_irqoff(SCHED_SOFTIRQ);
- }
+#endif
- irq_exit();
} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e3686b..b22906c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -39,6 +39,9 @@ */ #include <linux/cpufreq.h> #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#ifdef CONFIG_SCHED_HMP +#include <linux/cpuidle.h> +#endif #include "sched.h" @@ -3541,6 +3544,66 @@ static const int hmp_max_tasks = 5; extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list); +struct hmp_keepalive {
- bool init;
- ktime_t delay;
- struct hrtimer timer;
+}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive);
+/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{
- return HRTIMER_NORESTART;
+}
+static void hmp_keepalive_delay(unsigned int *ns_delay) +{
- struct cpuidle_driver *drv;
- drv = cpuidle_driver_ref();
Doesn't this mean that SCHED_HMP now needs to depend on CPU_IDLE (in the Kconfig files for both the arm and arm64 arches)? Or, considering this is a tuning, erm... 'hack' I guess is the right word ;-), how about making this function an inline no-op when CPU_IDLE is not defined?
- if (drv) {
unsigned int us_next = UINT_MAX;
unsigned int us_least = UINT_MAX;
unsigned int ns_next;
int idx;
for (idx = 0; idx < drv->state_count; idx++) {
if (drv->states[idx].target_residency < us_least) {
us_least = drv->states[idx].target_residency;
}
}
for (idx = 0; idx < drv->state_count; idx++) {
if (drv->states[idx].target_residency > us_least &&
drv->states[idx].target_residency < us_next) {
us_next = drv->states[idx].target_residency;
}
}
if (us_next)
You don't need the above check; shifting zero right by one gives zero :-)
us_next>>=1;
ns_next = us_next << 10;
I know accuracy doesn't really matter, but what's wrong with *1000 rather than <<10? It's not like we couldn't afford the one-off cost of a clock cycle or two.
Also, what is the reasoning behind picking half of the second-lowest residency time? That reasoning should probably be mentioned in the code as a comment.
[...]
@@ -7199,6 +7265,18 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; +#ifdef CONFIG_SCHED_HMP
- /* shortcut for hmp idle pull wakeups */
- if (unlikely(this_rq->wake_for_idle_pull)) {
this_rq->wake_for_idle_pull = 0;
if (hmp_idle_pull(this_cpu)) {
/* break out unless running nohz idle as well */
if (idle != CPU_IDLE)
return;
}
- }
+#endif
I don't understand the scheduler well enough to work out if the above is OK. Could we miss out on doing needed work if both idle pull and rebalance work were triggered close in time? I'll try and study the code some more.
hmp_force_up_migration(this_cpu); rebalance_domains(this_cpu, idle);