On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
When a normal forced up-migration takes place, we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU from going back into power-down while an up-migration is still pending.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
 kernel/sched/core.c  | 18 +++++++++++-
 kernel/sched/fair.c  | 74 ++++++++++++++++++++++++++++++++++++++++++--------
 kernel/sched/sched.h |  1 +
 3 files changed, 81 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index de9d360..89e49e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1407,7 +1407,11 @@ void scheduler_ipi(void)
 {
        if (llist_empty(&this_rq()->wake_list)
                        && !tick_nohz_full_cpu(smp_processor_id())
-                       && !got_nohz_idle_kick())
+                       && !got_nohz_idle_kick()
+#ifdef CONFIG_SCHED_HMP
+                       && !this_rq()->wake_for_idle_pull
+#endif
+                       )
                return;
        /*
@@ -1430,10 +1434,22 @@ void scheduler_ipi(void)
        /*
         * Check if someone kicked us for doing the nohz idle load balance.
         */
+#ifdef CONFIG_SCHED_HMP
+       {
+               bool nohz = got_nohz_idle_kick();
+               if (unlikely(this_rq()->wake_for_idle_pull) ||
+                               unlikely(nohz)) {
+                       if (unlikely(nohz))
+                               this_rq()->idle_balance = 1;
+                       raise_softirq_irqoff(SCHED_SOFTIRQ);
+               }
+       }
+#else
        if (unlikely(got_nohz_idle_kick())) {
                this_rq()->idle_balance = 1;
                raise_softirq_irqoff(SCHED_SOFTIRQ);
        }
+#endif
Wouldn't the above change have been much simpler as:
        if (unlikely(got_nohz_idle_kick())) {
                this_rq()->idle_balance = 1;
                raise_softirq_irqoff(SCHED_SOFTIRQ);
        }
+#ifdef CONFIG_SCHED_HMP
+       else if (unlikely(this_rq()->wake_for_idle_pull))
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+#endif
That would also avoid associating the comment about checking for the 'nohz idle load balance' with the HMP code, which also does the idle pull check.
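For clarity, the end of scheduler_ipi() would then read roughly as follows (untested, just the suggestion above applied to the quoted code):

        if (unlikely(got_nohz_idle_kick())) {
                this_rq()->idle_balance = 1;
                raise_softirq_irqoff(SCHED_SOFTIRQ);
        }
#ifdef CONFIG_SCHED_HMP
        else if (unlikely(this_rq()->wake_for_idle_pull))
                raise_softirq_irqoff(SCHED_SOFTIRQ);
#endif
        irq_exit();

which keeps the nohz comment attached to the code it actually describes.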
        irq_exit();
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a5cae4a..d59e0de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3541,6 +3541,43 @@ static const int hmp_max_tasks = 5;
 
 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
 
+struct hmp_keepalive {
+       bool init;
+       ktime_t delay;
+       struct hrtimer timer;
+};
+DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive);
+
+/* setup per-cpu keepalive timers */
+static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer)
+{
+       return HRTIMER_NORESTART;
+}
+
+static inline void hmp_cpu_keepalive_init(struct hmp_keepalive *keepalive, int cpu)
+{
+       struct hrtimer *hrtimer;
+       unsigned int ns_delay = 1000000;
Is there any significance to a 1ms delay? Should the value be related to the scheduler tick, or to idle wakeup latency, or to anything else that needs to differ between platforms?
If it's idle-related, I notice this figure is the same as the target_residency for the TC2 cpuidle driver. In the past, when timers have been set to exactly this period, whether a given kernel build decides it has enough time to go idle has sat on a knife edge, which can give dramatically different behaviour from build to build and week to week.
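If the intention is to keep the CPU out of the deeper idle states while a pull is pending, one (untested) way to tie the number to the platform rather than hard-coding it might be to size it off the cpuidle driver's residency figures at keepalive-init time. hmp_keepalive_delay_ns() below is a name I've made up, and it assumes CONFIG_CPU_IDLE, target_residency reported in microseconds, and state 0 being WFI:

#include <linux/kernel.h>
#include <linux/cpuidle.h>

/* Hypothetical helper: pick a keepalive period comfortably below the
 * shortest target_residency of any state deeper than WFI, so the governor
 * should never see enough predicted idle time to power the CPU down while
 * the keepalive is armed. Falls back to the current 1ms if no driver.
 */
static unsigned int hmp_keepalive_delay_ns(void)
{
        struct cpuidle_driver *drv = cpuidle_get_driver();
        unsigned int min_res_us = UINT_MAX;
        int i;

        if (drv) {
                /* shortest residency of any state deeper than state 0 (WFI) */
                for (i = 1; i < drv->state_count; i++)
                        if (drv->states[i].target_residency < min_res_us)
                                min_res_us = drv->states[i].target_residency;
        }
        if (min_res_us == UINT_MAX)
                return 1000000;         /* no deep states known: keep 1ms */

        /* stay well below the residency rather than sitting right on it */
        return (min_res_us / 2) * 1000; /* us -> ns */
}

That at least moves the knife edge away from the exact residency value, though whether the delay should track residency at all depends on what the 1ms is really meant to represent.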
+
+       if (keepalive->init)
+               return;
+
+       /* make sure we only init for the right CPU */
+       WARN_ON(keepalive != &per_cpu(hmp_cpu_keepalive, smp_processor_id()));
To be nit-picky, this WARN seems a little redundant given that the only caller (below) passes in exactly the parameter the function checks for; and I can't help wondering whether this init function couldn't just be part of hmp_cpu_keepalive_trigger(). None of this is important enough to warrant changing the code, though.
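For what it's worth, folding the init into the trigger might look something like this (untested sketch, reusing the fields and names from the patch):

static void hmp_cpu_keepalive_trigger(void)
{
        int cpu = smp_processor_id();
        struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);

        if (!keepalive->init) {
                hrtimer_init(&keepalive->timer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL_PINNED);
                keepalive->timer.function = hmp_cpu_keepalive_notify;
                keepalive->delay = ns_to_ktime(1000000);        /* 1ms, as in the patch */
                keepalive->init = true;
        }
        hrtimer_start(&keepalive->timer, keepalive->delay,
                      HRTIMER_MODE_REL_PINNED);
}

and the WARN_ON then disappears naturally, since the per-cpu lookup already uses smp_processor_id().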
+
+       hrtimer = &keepalive->timer;
+       hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+       hrtimer->function = hmp_cpu_keepalive_notify;
+       keepalive->delay = ns_to_ktime(ns_delay);
+       keepalive->init = true;
+}
+
+static void hmp_cpu_keepalive_trigger(void)
+{
+       int cpu = smp_processor_id();
+       struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
+
+       hmp_cpu_keepalive_init(keepalive, cpu);
+       hrtimer_start(&keepalive->timer, keepalive->delay, HRTIMER_MODE_REL_PINNED);
+}
 /* Setup hmp_domains */
 static int __init hmp_cpu_mask_setup(void)
 {
@@ -3598,9 +3635,13 @@ static void hmp_online_cpu(int cpu)
 static void hmp_offline_cpu(int cpu)
 {
        struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+       struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
 
        if(domain)
                cpumask_clear_cpu(cpu, &domain->cpus);
+
+       if (keepalive->init)
+               hrtimer_cancel(&keepalive->timer);
 }
 /*
  * Needed to determine heaviest tasks etc.
@@ -7021,7 +7062,7 @@ static void hmp_force_up_migration(int this_cpu)
                target = cpu_rq(cpu);
                raw_spin_lock_irqsave(&target->lock, flags);
                curr = target->cfs.curr;
-               if (!curr) {
+               if (!curr || target->active_balance) {
                        raw_spin_unlock_irqrestore(&target->lock, flags);
                        continue;
                }
@@ -7038,16 +7079,13 @@ static void hmp_force_up_migration(int this_cpu)
                curr = hmp_get_heaviest_task(curr, 1);
                p = task_of(curr);
                if (hmp_up_migration(cpu, &target_cpu, curr)) {
-                       if (!target->active_balance) {
-                               get_task_struct(p);
-                               target->push_cpu = target_cpu;
-                               target->migrate_task = p;
-                               got_target = 1;
-                               trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
-                               hmp_next_up_delay(&p->se, target->push_cpu);
-                       }
+                       cpu_rq(target_cpu)->wake_for_idle_pull = 1;
+                       raw_spin_unlock_irqrestore(&target->lock, flags);
+                       spin_unlock(&hmp_force_migration);
+                       smp_send_reschedule(target_cpu);
+                       return;
If I've followed the code correctly, does this change now mean that a CPU stops considering up-migrations after it has found this one, whereas before it would continue to look for others as well? Do we still want to do that? I guess it might be too complicated now that we're letting the other CPU pull things in its own time. Or, if we were in the situation where two tasks needed up-migrating, would the new pull method actually find both of them to pull anyway, so the behaviour ends up the same as before?
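For what it's worth, if we did still want to scan the remaining CPUs, I imagine it would look roughly like this (untested; it relies on hmp_force_migration staying held until the existing unlock at the end of the function):

                if (hmp_up_migration(cpu, &target_cpu, curr)) {
                        cpu_rq(target_cpu)->wake_for_idle_pull = 1;
                        raw_spin_unlock_irqrestore(&target->lock, flags);
                        smp_send_reschedule(target_cpu);
                        continue;       /* keep checking the other CPUs */
                }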
                }
-               if (!got_target && !target->active_balance) {
+               if (!got_target) {
                        /*
                         * For now we just check the currently running task.
                         * Selecting the lightest task for offloading will
@@ -7069,7 +7107,7 @@ static void hmp_force_up_migration(int this_cpu)
                 * is not currently running move it, otherwise let the
                 * CPU stopper take care of it.
                 */
-               if (got_target && !target->active_balance) {
+               if (got_target) {
                        if (!task_running(target, p)) {
                                trace_sched_hmp_migrate_force_running(p, 0);
                                hmp_migrate_runnable_task(target);
@@ -7175,6 +7213,8 @@ static unsigned int hmp_idle_pull(int this_cpu)
        raw_spin_unlock_irqrestore(&target->lock, flags);
 
        if (force) {
+               /* start timer to keep us awake */
+               hmp_cpu_keepalive_trigger();
                stop_one_cpu_nowait(cpu_of(target),
                        hmp_active_task_migration_cpu_stop,
                        target, &target->active_balance_work);
@@ -7198,6 +7238,18 @@ static void run_rebalance_domains(struct softirq_action *h)
        enum cpu_idle_type idle = this_rq->idle_balance ?
                                                CPU_IDLE : CPU_NOT_IDLE;
 
+#ifdef CONFIG_SCHED_HMP
+       /* shortcut for hmp idle pull wakeups */
+       if (unlikely(this_rq->wake_for_idle_pull)) {
+               this_rq->wake_for_idle_pull = 0;
+               if (hmp_idle_pull(this_cpu)) {
+                       /* break out unless running nohz idle as well */
+                       if (idle != CPU_IDLE)
+                               return;
+               }
+       }
+#endif
+
        hmp_force_up_migration(this_cpu);
 
        rebalance_domains(this_cpu, idle);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 989c5ae..0d19ede 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -466,6 +466,7 @@ struct rq {
        struct cpu_stop_work active_balance_work;
 #ifdef CONFIG_SCHED_HMP
        struct task_struct *migrate_task;
+       int wake_for_idle_pull;
 #endif
        /* cpu of this runqueue: */
        int cpu;