In anticipation of modifying the up_threshold handling, make all instances use the same utility fn to check if a task is eligible for up-migration.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a8808..1b9be59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6734,6 +6734,13 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif
#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{ + /* below hmp_up_threshold, never eligible */ + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + return 1; +} /* Check if task should migrate to a faster cpu */ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { @@ -6749,7 +6756,7 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_enti if (p->prio >= hmp_up_prio) return 0; #endif - if (se->avg.load_avg_ratio < hmp_up_threshold) + if (!hmp_task_eligible_for_up_migration(se)) return 0;
/* Let the task load settle before doing another up migration */ @@ -7237,7 +7244,10 @@ static unsigned int hmp_idle_pull(int this_cpu) } orig = curr; curr = hmp_get_heaviest_task(curr, 1); - if (curr->avg.load_avg_ratio > hmp_up_threshold && + /* check if heaviest eligible task on this + * CPU is heavier than previous task + */ + if (hmp_task_eligible_for_up_migration(curr) && curr->avg.load_avg_ratio > ratio) { p = task_of(curr); target = rq;
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 188 +++++++++++---------------------------------------- 1 file changed, 40 insertions(+), 148 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b9be59..a5cae4a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6214,6 +6214,7 @@ out: #ifdef CONFIG_SCHED_HMP static unsigned int hmp_idle_pull(int this_cpu); #endif +static int move_specific_task(struct lb_env *env, struct task_struct *pm); /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -6273,21 +6274,17 @@ void idle_balance(int this_cpu, struct rq *this_rq) } }
-/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) +static int __do_active_load_balance_cpu_stop(void *data, bool check_sd_lb_flag) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p;
raw_spin_lock_irq(&busiest_rq->lock); + p = busiest_rq->migrate_task;
/* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -6298,6 +6295,11 @@ static int active_load_balance_cpu_stop(void *data) if (busiest_rq->nr_running <= 1) goto out_unlock;
+ if (!check_sd_lb_flag) { + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + } /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by @@ -6311,12 +6313,14 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + if (((check_sd_lb_flag && sd->flags & SD_LOAD_BALANCE) || + !check_sd_lb_flag) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; }
if (likely(sd)) { + bool success = false; struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, @@ -6328,7 +6332,14 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env)) + if (check_sd_lb_flag) { + if (move_one_task(&env)) + success = true; + } else { + if (move_specific_task(&env, p)) + success = true; + } + if (success) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -6336,11 +6347,24 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: + if (!check_sd_lb_flag) + put_task_struct(p); busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); return 0; }
+/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, true); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6900,151 +6924,19 @@ static int move_specific_task(struct lb_env *env, struct task_struct *pm) * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to * migrate a specific task from one runqueue to another. * hmp_force_up_migration uses this to push a currently running task - * off a runqueue. - * Based on active_load_balance_stop_cpu and can potentially be merged. + * off a runqueue. hmp_idle_pull uses this to pull a currently + * running task to an idle runqueue. + * Reuses __do_active_load_balance_cpu_stop to actually do the work. */ static int hmp_active_task_migration_cpu_stop(void *data) { - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) { - goto out_unlock; - } - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; -} - -/* - * hmp_idle_pull_cpu_stop is run by cpu stopper and used to - * migrate a specific task from one runqueue to another. - * hmp_idle_pull uses this to push a currently running task - * off a runqueue to a faster CPU. - * Locking is slightly different than usual. - * Based on active_load_balance_stop_cpu and can potentially be merged. 
- */ -static int hmp_idle_pull_cpu_stop(void *data) -{ - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) - goto out_unlock; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; + return __do_active_load_balance_cpu_stop(data, false); }
/* * Move task in a runnable state to another CPU. * - * Tailored on 'active_load_balance_stop_cpu' with slight + * Tailored on 'active_load_balance_cpu_stop' with slight * modification to locking and pre-transfer checks. Note * rq->lock must be held before calling. */ @@ -7284,7 +7176,7 @@ static unsigned int hmp_idle_pull(int this_cpu)
if (force) { stop_one_cpu_nowait(cpu_of(target), - hmp_idle_pull_cpu_stop, + hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); } done:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
Code looks functionally identical to what went before and uses only one function rather than three copies, so that's good. I could bike-shed a bit about the coding style, but will resist :-)
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/core.c | 18 +++++++++++- kernel/sched/fair.c | 74 ++++++++++++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 1 + 3 files changed, 81 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de9d360..89e49e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1407,7 +1407,11 @@ void scheduler_ipi(void) { if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + && !got_nohz_idle_kick() +#ifdef CONFIG_SCHED_HMP + && !this_rq()->wake_for_idle_pull +#endif + ) return;
/* @@ -1430,10 +1434,22 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ +#ifdef CONFIG_SCHED_HMP + { + bool nohz = got_nohz_idle_kick(); + if (unlikely(this_rq()->wake_for_idle_pull) || + unlikely(nohz)) { + if (unlikely(nohz)) + this_rq()->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } + } +#else if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#endif irq_exit(); }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a5cae4a..d59e0de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3541,6 +3541,43 @@ static const int hmp_max_tasks = 5;
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+struct hmp_keepalive { + bool init; + ktime_t delay; + struct hrtimer timer; +}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); + +/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +static inline void hmp_cpu_keepalive_init(struct hmp_keepalive *keepalive, int cpu) +{ + struct hrtimer *hrtimer; + unsigned int ns_delay = 1000000; + if (keepalive->init) + return; + /* make sure we only init for the right CPU */ + WARN_ON(keepalive != &per_cpu(hmp_cpu_keepalive, smp_processor_id())); + + hrtimer = &keepalive->timer; + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + hrtimer->function = hmp_cpu_keepalive_notify; + keepalive->delay = ns_to_ktime(ns_delay); + keepalive->init = true; +} + +static void hmp_cpu_keepalive_trigger(void) +{ + int cpu = smp_processor_id(); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + hmp_cpu_keepalive_init(keepalive, cpu); + hrtimer_start(&keepalive->timer, keepalive->delay, HRTIMER_MODE_REL_PINNED); +} + /* Setup hmp_domains */ static int __init hmp_cpu_mask_setup(void) { @@ -3598,9 +3635,13 @@ static void hmp_online_cpu(int cpu) static void hmp_offline_cpu(int cpu) { struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
if(domain) cpumask_clear_cpu(cpu, &domain->cpus); + + if (keepalive->init) + hrtimer_cancel(&keepalive->timer); } /* * Needed to determine heaviest tasks etc. @@ -7021,7 +7062,7 @@ static void hmp_force_up_migration(int this_cpu) target = cpu_rq(cpu); raw_spin_lock_irqsave(&target->lock, flags); curr = target->cfs.curr; - if (!curr) { + if (!curr || target->active_balance) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } @@ -7038,16 +7079,13 @@ static void hmp_force_up_migration(int this_cpu) curr = hmp_get_heaviest_task(curr, 1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { - if (!target->active_balance) { - get_task_struct(p); - target->push_cpu = target_cpu; - target->migrate_task = p; - got_target = 1; - trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE); - hmp_next_up_delay(&p->se, target->push_cpu); - } + cpu_rq(target_cpu)->wake_for_idle_pull = 1; + raw_spin_unlock_irqrestore(&target->lock, flags); + spin_unlock(&hmp_force_migration); + smp_send_reschedule(target_cpu); + return; } - if (!got_target && !target->active_balance) { + if (!got_target) { /* * For now we just check the currently running task. * Selecting the lightest task for offloading will @@ -7069,7 +7107,7 @@ static void hmp_force_up_migration(int this_cpu) * is not currently running move it, otherwise let the * CPU stopper take care of it. */ - if (got_target && !target->active_balance) { + if (got_target) { if (!task_running(target, p)) { trace_sched_hmp_migrate_force_running(p, 0); hmp_migrate_runnable_task(target); @@ -7175,6 +7213,8 @@ static unsigned int hmp_idle_pull(int this_cpu) raw_spin_unlock_irqrestore(&target->lock, flags);
if (force) { + /* start timer to keep us awake */ + hmp_cpu_keepalive_trigger(); stop_one_cpu_nowait(cpu_of(target), hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); @@ -7198,6 +7238,18 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE;
+#ifdef CONFIG_SCHED_HMP + /* shortcut for hmp idle pull wakeups */ + if (unlikely(this_rq->wake_for_idle_pull)) { + this_rq->wake_for_idle_pull = 0; + if (hmp_idle_pull(this_cpu)) { + /* break out unless running nohz idle as well */ + if (idle != CPU_IDLE) + return; + } + } +#endif + hmp_force_up_migration(this_cpu);
rebalance_domains(this_cpu, idle); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 989c5ae..0d19ede 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -466,6 +466,7 @@ struct rq { struct cpu_stop_work active_balance_work; #ifdef CONFIG_SCHED_HMP struct task_struct *migrate_task; + int wake_for_idle_pull; #endif /* cpu of this runqueue: */ int cpu;
Hi all,
Apologies for the missing coverletter. These are the patches Basil mentioned earlier today, all review comments gratefully received :)
I have just spotted a bug, so there might be a fix for that coming along tomorrow. I don't think it's critical yet, it can probably wait if we need to.
--Chris
When looking for a task to be idle-pulled, don't consider tasks where the affinity does not allow that task to be placed on the target CPU.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d59e0de..df28dbf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7178,7 +7178,8 @@ static unsigned int hmp_idle_pull(int this_cpu) * CPU is heavier than previous task */ if (hmp_task_eligible_for_up_migration(curr) && - curr->avg.load_avg_ratio > ratio) { + curr->avg.load_avg_ratio > ratio && + cpumask_test_cpu(this_cpu, tsk_cpus_allowed(task_of(curr)))) { p = task_of(curr); target = rq; ratio = curr->avg.load_avg_ratio;
On Thu, 2014-03-13 at 17:47 +0000, Chris Redpath wrote:
When looking for a task to be idle-pulled, don't consider tasks where the affinity does not allow that task to be placed on the target CPU.
Signed-off-by: Chris Redpath chris.redpath@arm.com
kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d59e0de..df28dbf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7178,7 +7178,8 @@ static unsigned int hmp_idle_pull(int this_cpu)
 	 * CPU is heavier than previous task
 	 */
 	if (hmp_task_eligible_for_up_migration(curr) &&
-			curr->avg.load_avg_ratio > ratio) {
+			curr->avg.load_avg_ratio > ratio &&
+			cpumask_test_cpu(this_cpu, tsk_cpus_allowed(task_of(curr)))) {
 		p = task_of(curr);
 		target = rq;
 		ratio = curr->avg.load_avg_ratio;
If this change is needed, would not a similar change also be required in hmp_force_up_migration at the point where we changed it in the last patch to do the wake_for_idle_pull = 1 bit?
On 14/03/14 14:12, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:47 +0000, Chris Redpath wrote:
When looking for a task to be idle-pulled, don't consider tasks where the affinity does not allow that task to be placed on the target CPU.
[...]
If this change is needed, would not a similar change also be required in hmp_force_up_migration at the point where we changed it in the last patch to do the wake_for_idle_pull = 1 bit?
I don't believe we need it. When we are in hmp_force_up_migration the affinity is taken into account when deciding if a task can be up-migrated. This happens at
hmp_force_up_migration->hmp_up_migration->hmp_domain_min_load
when we return the target CPU. Note that hmp_domain_min_load returns the lowest loaded CPU in an hmp domain amongst the CPUs in the provided cpumask (if provided). force_up_migration provides the task affinity of the biggest task.
In the problem area we are just doing an idle pull inspection because a big CPU has gone idle. This doesn't use hmp_domain_min_load so we need to manage affinity separately.
There could be a potential problem in that if we have multiple large tasks on the same CPU to choose between and the running one was pinned, we would not be able to see the other one. However, hmp_get_heaviest_task already knows if the task selection is for an up-migration and will only return a task which both has enough load and is allowed to run on a bigger CPU. The risk is that we select a task which is able to move to a bigger CPU, but is explicitly disallowed on the one we're going to move it to. I haven't seen such a situation, but nothing prevents it.
I can fix that by changing the second parameter of hmp_get_heaviest_task to take a CPU that the task must be allowed to move to. Both callers of this fn currently ask that the heaviest task is checked for affinity to at least some of the bigger domain; all I need to do is pass the CPU we are on from the idle pull caller.
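As a rough sketch of that signature change (illustrative only, not the posted code; the helper name below is hypothetical and the body of hmp_get_heaviest_task itself is elided, it would simply skip candidates that fail this test):

/* Hypothetical sketch: pass the CPU the chosen task must be allowed to run
 * on instead of a boolean; -1 keeps the old behaviour of only checking
 * affinity against the faster domain as a whole.
 */
static inline int hmp_task_allowed_on_cpu(struct sched_entity *se, int target_cpu)
{
        if (target_cpu < 0)
                return 1;
        return cpumask_test_cpu(target_cpu, tsk_cpus_allowed(task_of(se)));
}

/* the two call sites would then become something like: */
curr = hmp_get_heaviest_task(curr, -1);        /* hmp_force_up_migration() */
curr = hmp_get_heaviest_task(curr, this_cpu);  /* hmp_idle_pull() */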
--Chris
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath chris.redpath@arm.com
kernel/sched/core.c | 18 +++++++++++- kernel/sched/fair.c | 74 ++++++++++++++++++++++++++++++++++++++++++-------- kernel/sched/sched.h | 1 + 3 files changed, 81 insertions(+), 12 deletions(-)
[...]
@@ -1430,10 +1434,22 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
+#ifdef CONFIG_SCHED_HMP
+	{
+		bool nohz = got_nohz_idle_kick();
+		if (unlikely(this_rq()->wake_for_idle_pull) ||
+				unlikely(nohz)) {
+			if (unlikely(nohz))
+				this_rq()->idle_balance = 1;
+			raise_softirq_irqoff(SCHED_SOFTIRQ);
+		}
+	}
+#else
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#endif
Wouldn't the above change have been much simpler as:
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#ifdef CONFIG_SCHED_HMP
+	else if (unlikely(this_rq()->wake_for_idle_pull))
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+#endif
That would also avoid having the comment about checking for 'nohz idle load balance' associated with the HMP code which also does the idle pull check.
[...]
+static inline void hmp_cpu_keepalive_init(struct hmp_keepalive *keepalive, int cpu)
+{
+	struct hrtimer *hrtimer;
+	unsigned int ns_delay = 1000000;
Is there any significance to a 1ms delay? Should the value be related to the scheduler tick, or idle wakeup latency? Or anything which needs to be different on different platforms?
If it's idle related, I notice this figure is the same as the target_residency for the TC2 driver, and before when people have had timers set for this period things sit on a knife edge as to whether a given version of a kernel will decide that it has enough time to go idle, or not. Which can give dramatically different behaviour from build to build and week to week.
+	if (keepalive->init)
+		return;
+	/* make sure we only init for the right CPU */
+	WARN_ON(keepalive != &per_cpu(hmp_cpu_keepalive, smp_processor_id()));
To be nit-picky, this WARN seems a little redundant given that the only caller (below) passes in exactly the parameter the function checks for; and I can't help wondering if this init function can't just be part of hmp_cpu_keepalive_trigger(). None of this is important enough to warrant changing the code though.
[...]
@@ -7038,16 +7079,13 @@ static void hmp_force_up_migration(int this_cpu)
 		curr = hmp_get_heaviest_task(curr, 1);
 		p = task_of(curr);
 		if (hmp_up_migration(cpu, &target_cpu, curr)) {
-			if (!target->active_balance) {
-				get_task_struct(p);
-				target->push_cpu = target_cpu;
-				target->migrate_task = p;
-				got_target = 1;
-				trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
-				hmp_next_up_delay(&p->se, target->push_cpu);
-			}
+			cpu_rq(target_cpu)->wake_for_idle_pull = 1;
+			raw_spin_unlock_irqrestore(&target->lock, flags);
+			spin_unlock(&hmp_force_migration);
+			smp_send_reschedule(target_cpu);
+			return;
If I've followed the code correctly, does this change now mean that a CPU stops considering up migrations after it found this one, whereas before it would continue to look for others as well. Do we want to still do that? Guess it might be too complicated if we're now letting the other CPU pull things in it's own time. Or, if we were in the situation before where two tasks needed up migrating, would the new pull method actually find both of those to pull anyway and so behaviour is the same as before?
Hi Tixy,
On 14/03/14 13:58, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
[...]
Wouldn't the above change have been much simpler as:
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
+#ifdef CONFIG_SCHED_HMP
+	else if (unlikely(this_rq()->wake_for_idle_pull))
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+#endif
That would also avoid having the comment about checking for 'nohz idle load balance' associated with the HMP code which also does the idle pull check.
I like this change. I was originally intending to do something different here and ended up removing it, but I didn't go back and clean this up properly.
I will issue a respin of this one early next week because of the stuff below, so I'll incorporate this.
[...]
+	unsigned int ns_delay = 1000000;
Is there any significance to a 1ms delay? Should the value be related to the scheduler tick, or idle wakeup latency? Or anything which needs to be different on different platforms?
If it's idle related, I notice this figure is the same as the target_residency for the TC2 driver, and before when people have had timers set for this period things sit on a knife edge as to whether a given version of a kernel will decide that it has enough time to go idle, or not. Which can give dramatically different behaviour from build to build and week to week.
It is related to idle in that if there are no pending events on the big CPU it will go back to the power-down state again in the small gap (~100usec on TC2) between the big CPU signalling it's now awake by pulling a task and the task actually being pushed off the little CPU.
I was aiming for a value which would keep the system alive but be definitely longer than the time it takes to do the push. In an ideal world I'd be able to specify a zero-latency requirement on the target CPUs and clear it when the timer expires, which would guarantee the effect, but there isn't a way to express that in the kernel at the moment, hence the crude timer mechanism.
Do you think it would be worth fetching the cpuidle target residency of state 1 and setting the timer for MAX(150usec, state[1]->exit_latency/2)?
I already provided storage for the timer duration which I initialize only once in case this might be needed but I wasn't too keen on integrating cpuidle here unless I really need to. It sounds like you think I do :)
I'll prototype something for the respin.
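For the sake of discussion, a minimal sketch of that idea (an assumption about how it might be wired up, not the respin itself) could derive the delay from the registered cpuidle driver at trigger time:

#include <linux/cpuidle.h>

/* Sketch only: MAX(150us, exit_latency of state 1 / 2), falling back to the
 * 150us floor if no cpuidle driver is registered.  exit_latency is in
 * microseconds in struct cpuidle_state.
 */
static unsigned int hmp_keepalive_delay_ns(void)
{
        struct cpuidle_driver *drv = cpuidle_get_driver();
        unsigned int delay_us = 150;

        if (drv && drv->state_count > 1) {
                unsigned int half_exit = drv->states[1].exit_latency / 2;

                if (half_exit > delay_us)
                        delay_us = half_exit;
        }
        return delay_us * NSEC_PER_USEC;
}

keepalive->delay would then be set with ns_to_ktime(hmp_keepalive_delay_ns()) in place of the fixed 1ms.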
+	if (keepalive->init)
+		return;
+	/* make sure we only init for the right CPU */
+	WARN_ON(keepalive != &per_cpu(hmp_cpu_keepalive, smp_processor_id()));
To be nit-picky, this WARN seems a little redundant given that the only caller (below) passes in exactly the parameter the function checks for; and I can't help wondering if this init function can't just be part of hmp_cpu_keepalive_trigger(). None of this is important enough to warrant changing the code though.
This is another leftover from an earlier version where the initialization was not done as-needed but on CPU hotplug so I could avoid any checking in the timer arming path. I'll remove it in the respin - probably just roll it into the trigger as you suggest.
[...]
If I've followed the code correctly, does this change now mean that a CPU stops considering up migrations after it found this one, whereas before it would continue to look for others as well. Do we want to still do that? Guess it might be too complicated if we're now letting the other CPU pull things in it's own time. Or, if we were in the situation before where two tasks needed up migrating, would the new pull method actually find both of those to pull anyway and so behaviour is the same as before?
I intended this to work exactly as it is now, and rely upon the frequent calls to force_up_migration to cover all the CPUs because it was easier than handling the fight over runqueue access which happens when we wake multiple big CPUs and ask them each to find the biggest task to pull.
After having thought about your comment perhaps it's less optimal than it could be and I can probably do better.
As a first solution, I could instead count the number of eligible tasks and wake MIN(num_of_big_cpus, num_of_eligible_tasks) big CPUs. I can change idle pull to wait for the migration spinlock rather than exit if it can't take it. That will allow the big CPUs to sync themselves and each task to be pulled.
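Very roughly, that first solution might look like the sketch below. This is not a tested patch: the eligible-task count and the big-cluster mask are assumed to come from existing HMP bookkeeping, and the function name is hypothetical; the point is only the MIN() and the per-CPU kick.

/* Hypothetical sketch: kick at most MIN(idle big CPUs, eligible tasks)
 * CPUs, one reschedule IPI each.
 */
static void hmp_kick_idle_big_cpus(unsigned int nr_eligible,
                                   const struct cpumask *big_cpus)
{
        unsigned int kicks = min_t(unsigned int, nr_eligible,
                                   cpumask_weight(big_cpus));
        int cpu;

        for_each_cpu(cpu, big_cpus) {
                if (!kicks)
                        break;
                if (!idle_cpu(cpu))
                        continue;
                cpu_rq(cpu)->wake_for_idle_pull = 1;
                smp_send_reschedule(cpu);
                kicks--;
        }
}

The other half of the idea, swapping the spin_trylock() on hmp_force_migration in hmp_idle_pull() for a plain spin_lock(), is small but is exactly the lock-waiting concern discussed below.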
On Fri, 2014-03-14 at 16:09 +0000, Chris Redpath wrote:
On 14/03/14 13:58, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
[...]
If I've followed the code correctly, does this change now mean that a CPU stops considering up migrations after it found this one, whereas before it would continue to look for others as well. Do we want to still do that? Guess it might be too complicated if we're now letting the other CPU pull things in it's own time. Or, if we were in the situation before where two tasks needed up migrating, would the new pull method actually find both of those to pull anyway and so behaviour is the same as before?
I intended this to work exactly as it is now, and rely upon the frequent calls to force_up_migration to cover all the CPUs because it was easier than handling the fight over runqueue access which happens when we wake multiple big CPUs and ask them each to find the biggest task to pull.
After having thought about your comment perhaps it's less optimal than it could be and I can probably do better.
As a first solution, I could instead count the number of eligible tasks and wake MIN(num_of_big_cpus, num_of_eligible_tasks) big CPUs. I can change idle pull to wait for the migration spinlock rather than exit if it can't take it. That will allow the big CPUs to sync themselves and each task to be pulled.
My gut reaction is that having CPUs waiting on spin-locks doesn't sound too efficient. In practice, how likely is needing to migrate more than one task anyway?
I don't have a good grasp of the big picture, and only tend to think of these things once a month or so when I see new patches arrive, so my first thoughts are usually just to ask more questions, here's some more... :-)
If you woke up multiple cores, how would you get each core to pick up the right task? Could you not have a situation where little cores saw these possible migrations:
Task 1 (affine to both CPU A and B) --> CPU A
Task 2 (affine to CPU B) --> CPU B
But CPU B runs first and picks Task 1, then when CPU A gets the lock it only sees Task 2 to migrate, but its affinity doesn't allow that.
Come to think of it, is it possible for big CPUs to be busy with other things and the little cores to go through the migrate process twice before the big CPU has acted on the first request? Does wake_for_idle_pull need to be a counter?
Or does none of this really matter, as this migration is best effort, and the odd missed migration opportunity is OK as it will be retried again soon anyway if the task still needs migration?
If the latter, perhaps the code is OK as it is in these patches, i.e. only finding one task at a time to migrate and waking one big CPU.
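For what it's worth, the 'counter' variant asked about above could look roughly like this; a purely illustrative sketch, not from the posted patches, and it assumes wake_for_idle_pull is changed from int to atomic_t:

/* little CPU, when it finds an eligible task: record the request and kick */
atomic_inc(&cpu_rq(target_cpu)->wake_for_idle_pull);
smp_send_reschedule(target_cpu);

/* big CPU, in the SCHED_SOFTIRQ handler: drain whatever was requested */
{
        int pending = atomic_xchg(&this_rq->wake_for_idle_pull, 0);

        while (pending-- > 0)
                if (!hmp_idle_pull(this_cpu))
                        break;  /* nothing left worth pulling */
}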
On 14/03/14 17:16, Jon Medhurst (Tixy) wrote:
On Fri, 2014-03-14 at 16:09 +0000, Chris Redpath wrote:
On 14/03/14 13:58, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
[...]
If I've followed the code correctly, does this change now mean that a CPU stops considering up migrations after it found this one, whereas before it would continue to look for others as well. Do we want to still do that? Guess it might be too complicated if we're now letting the other CPU pull things in it's own time. Or, if we were in the situation before where two tasks needed up migrating, would the new pull method actually find both of those to pull anyway and so behaviour is the same as before?
I intended this to work exactly as it is now, and rely upon the frequent calls to force_up_migration to cover all the CPUs because it was easier than handling the fight over runqueue access which happens when we wake multiple big CPUs and ask them each to find the biggest task to pull.
After having thought about your comment perhaps it's less optimal than it could be and I can probably do better.
As a first solution, I could instead count the number of eligible tasks and wake MIN(num_of_big_cpus, num_of_eligible_tasks) big CPUs. I can change idle pull to wait for the migration spinlock rather than exit if it can't take it. That will allow the big CPUs to sync themselves and each task to be pulled.
My gut reaction is that having CPUs waiting on spin-locks doesn't sound too efficient. In practice, how likely is needing to migrate more than one task anyway?
Yes, waiting is not great but it's not that likely with mobile workloads. IMO it's one of those cases where you can construct a test case but you're unlikely to hit it in real life.
I don't have a good grasp of the big picture, and only tend to think of these things once a month or so when I see new patches arrive, so my first thoughts are usually just to ask more questions, here's some more... :-)
If you woke up multiple cores, how would you get each core to pick up the right task? Could you not have a situation where little cores saw these possible migrations:
Task 1 (affine to both CPU A and B) --> CPU A
Task 2 (affine to CPU B) --> CPU B
But CPU B runs first and picks Task 1, then when CPU A gets the lock it only sees Task 2 to migrate, but its affinity doesn't allow that.
Indeed, this is one of the reasons I wanted to leave it alone in the initial patch :) There are other potentially awkward scenarios as well.
Come to think of it, is it possible for big CPUs to be busy with other things and the little cores to go through the migrate process twice before the big CPU has acted on the first request? Does wake_for_idle_pull need to be a counter?
CPUs won't idle pull if they are busy, the request will be ignored and the little core will be left to get on with its business.
Or does none of this really matter, as this migration is best effort, and the odd missed migration opportunity is OK as it will be retried again soon anyway if the task still needs migration?
This is a pretty good description of the situation - usually we'll get another opportunity fairly soon, often in <10ms on another CPU in the cluster.
It's also the case for any migration that there could be loads of reasons why you can't complete the migration when you come to do it - the task could have moved elsewhere when you get to run the stopper, it could have gone to sleep or died. Really all migrations are only best-effort and eventually the system is balanced :)
I think previously we might have attempted to set up multiple migrations, but we couldn't have moved multiple tasks from a single CPU to different ones at the same time even if we had been able to set that up.
If the latter, perhaps the code is OK as it is in these patches, i.e. only finding one task at a time to migrate and waking one big CPU.
I'm still enjoying thinking about it :) One at a time is much simpler and we do need the bugfix to avoid a pinned task obscuring an unpinned one. I'm tempted to send you a respin with still one-at-a-time idle pulls but I haven't had testing time yet so it will have to be next week.
--Chris
On Fri, 2014-03-14 at 17:40 +0000, Chris Redpath wrote:
On 14/03/14 17:16, Jon Medhurst (Tixy) wrote:
[...]
Come to think of it, is it possible for big CPUs to be busy with other things and the little cores to go through the migrate process twice before the big CPU has acted on the first request? Does wake_for_idle_pull need to be a counter?
CPUs won't idle pull if they are busy, the request will be ignored
Does 'busy' include running a long-running interrupt service routine just after it enables IRQs on coming out of idle? A CPU can be 'busy' even if there are no tasks scheduled on it. I'm only raising things like this as something that code needs to be robust against, i.e. no oopses or deadlocks, not as something that necessarily affects normal use-case efficiency.
[...]
If the latter, perhaps the code is OK as it is in these patches, i.e. only finding one task at a time to migrate and waking one big CPU.
I'm still enjoying thinking about it :) One at a time is much simpler and we do need the bugfix to avoid a pinned task obscuring an unpinned one. I'm tempted to send you a respin with still one-at-a-time idle pulls but I haven't had testing time yet so it will have to be next week.
Sure, I won't push anything towards the direction of LSK for now.
On Fri, 2014-03-14 at 18:04 +0000, Jon Medhurst (Tixy) wrote:
On Fri, 2014-03-14 at 17:40 +0000, Chris Redpath wrote:
On 14/03/14 17:16, Jon Medhurst (Tixy) wrote:
If the latter, perhaps the code is OK as it is in these patches, i.e. only finding one task at a time to migrate and waking one big CPU.
I'm still enjoying thinking about it :) One at a time is much simpler and we do need the bugfix to avoid a pinned task obscuring an unpinned one. I'm tempted to send you a respin with still one-at-a-time idle pulls but I haven't had testing time yet so it will have to be next week.
Sure, I won't push anything towards the direction of LSK for now.
Just in case I wasn't clear, that means there are currently no new MP patches in the 13.03 LSK release. If you are testing revised versions that you were hoping to get into the release it would be good to get them reviewed now rather than waiting for any testing to finish, otherwise they are likely not going to make it into the release. Of course, there's no practical reason why patches can't go into LSK next week after a good review and test, but I know some people get fixated on the monthly release cycle rather than do things as and when they're ready.
On 18/03/14 09:36, Jon Medhurst (Tixy) wrote:
On Fri, 2014-03-14 at 18:04 +0000, Jon Medhurst (Tixy) wrote:
On Fri, 2014-03-14 at 17:40 +0000, Chris Redpath wrote:
On 14/03/14 17:16, Jon Medhurst (Tixy) wrote:
If the latter, perhaps the code is OK as it is in these patches, i.e. only finding one task at a time to migrate and waking one big CPU.
I'm still enjoying thinking about it :) One at a time is much simpler and we do need the bugfix to avoid a pinned task obscuring an unpinned one. I'm tempted to send you a respin with still one-at-a-time idle pulls but I haven't had testing time yet so it will have to be next week.
Sure, I won't push anything towards the direction of LSK for now.
Just in case I wasn't clear, that means there are currently no new MP patches in the 13.03 LSK release. If you are testing revised versions that you were hoping to get into the release it would be good to get them reviewed now rather than waiting for any testing to finish, otherwise they are likely not going to make it into the release. Of course, there's no practical reason why patches can't go into LSK next week after a good review and test, but I know some people get fixated on the monthly release cycle rather than do things as and when they're ready.
No problem Tixy, I'm still tweaking and not sitting about waiting for a test cycle :)
I'll push out where I am and we can talk about those.
--Chris
Hi Tixy, this is the current state of the patches for your review. Obviously there is a bit of squashing and rebasing required - patches 5 and 6 will probably be best rolled into patch 3, but I wanted you to see them separately first.
--Chris
[PATCH 1/6] hmp: sched: Clean up hmp_up_threshold checks into a
[PATCH 2/6] sched: hmp: unify active migration code
[PATCH 3/6] hmp: Use idle pull to perform forced up-migrations
[PATCH 4/6] hmp: dont attempt to pull tasks if affinity doesn't
[PATCH 5/6] sched: hmp: select keepalive delay based upon idle
[PATCH 6/6] sched: hmp: simplify logic around triggering resched for
In anticipation of modifying the up_threshold handling, make all instances use the same utility fn to check if a task is eligible for up-migration. This also removes the previous difference in threshold comparison where up-migration used '!<threshold' and idle pull used '>threshold' to decide up-migration eligibility. Make them both use '!<threshold' instead for consistency, although this is unlikely to change any results.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a8808..29e2c74 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6734,6 +6734,14 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif
#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{ + /* below hmp_up_threshold, never eligible */ + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + return 1; +} + /* Check if task should migrate to a faster cpu */ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { @@ -6749,7 +6757,7 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_enti if (p->prio >= hmp_up_prio) return 0; #endif - if (se->avg.load_avg_ratio < hmp_up_threshold) + if (!hmp_task_eligible_for_up_migration(se)) return 0;
/* Let the task load settle before doing another up migration */ @@ -7237,7 +7245,10 @@ static unsigned int hmp_idle_pull(int this_cpu) } orig = curr; curr = hmp_get_heaviest_task(curr, 1); - if (curr->avg.load_avg_ratio > hmp_up_threshold && + /* check if heaviest eligible task on this + * CPU is heavier than previous task + */ + if (hmp_task_eligible_for_up_migration(curr) && curr->avg.load_avg_ratio > ratio) { p = task_of(curr); target = rq;
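As a standalone illustration of the boundary change introduced by the helper above (plain C, not kernel code; the threshold value here is invented purely for the example):

#include <stdio.h>

static unsigned int threshold = 512;	/* stand-in for hmp_up_threshold; value invented */

static unsigned int eligible_for_up_migration(unsigned long load_avg_ratio)
{
	/* same test as the new helper: below threshold, never eligible */
	if (load_avg_ratio < threshold)
		return 0;
	return 1;
}

int main(void)
{
	/* the old idle-pull test was (load > threshold), which rejected 512 */
	printf("load 511 -> %u\n", eligible_for_up_migration(511));	/* 0 */
	printf("load 512 -> %u\n", eligible_for_up_migration(512));	/* 1, boundary case */
	printf("load 513 -> %u\n", eligible_for_up_migration(513));	/* 1 */
	return 0;
}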
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 188 +++++++++++---------------------------------------- 1 file changed, 40 insertions(+), 148 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 29e2c74..4e3686b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6214,6 +6214,7 @@ out: #ifdef CONFIG_SCHED_HMP static unsigned int hmp_idle_pull(int this_cpu); #endif +static int move_specific_task(struct lb_env *env, struct task_struct *pm); /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -6273,21 +6274,17 @@ void idle_balance(int this_cpu, struct rq *this_rq) } }
-/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) +static int __do_active_load_balance_cpu_stop(void *data, bool check_sd_lb_flag) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p;
raw_spin_lock_irq(&busiest_rq->lock); + p = busiest_rq->migrate_task;
/* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -6298,6 +6295,11 @@ static int active_load_balance_cpu_stop(void *data) if (busiest_rq->nr_running <= 1) goto out_unlock;
+ if (!check_sd_lb_flag) { + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + } /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by @@ -6311,12 +6313,14 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + if (((check_sd_lb_flag && sd->flags & SD_LOAD_BALANCE) || + !check_sd_lb_flag) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; }
if (likely(sd)) { + bool success = false; struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, @@ -6328,7 +6332,14 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env)) + if (check_sd_lb_flag) { + if (move_one_task(&env)) + success = true; + } else { + if (move_specific_task(&env, p)) + success = true; + } + if (success) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -6336,11 +6347,24 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: + if (!check_sd_lb_flag) + put_task_struct(p); busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); return 0; }
+/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, true); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6901,151 +6925,19 @@ static int move_specific_task(struct lb_env *env, struct task_struct *pm) * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to * migrate a specific task from one runqueue to another. * hmp_force_up_migration uses this to push a currently running task - * off a runqueue. - * Based on active_load_balance_stop_cpu and can potentially be merged. + * off a runqueue. hmp_idle_pull uses this to pull a currently + * running task to an idle runqueue. + * Reuses __do_active_load_balance_cpu_stop to actually do the work. */ static int hmp_active_task_migration_cpu_stop(void *data) { - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) { - goto out_unlock; - } - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; -} - -/* - * hmp_idle_pull_cpu_stop is run by cpu stopper and used to - * migrate a specific task from one runqueue to another. - * hmp_idle_pull uses this to push a currently running task - * off a runqueue to a faster CPU. - * Locking is slightly different than usual. - * Based on active_load_balance_stop_cpu and can potentially be merged. 
- */ -static int hmp_idle_pull_cpu_stop(void *data) -{ - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) - goto out_unlock; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; + return __do_active_load_balance_cpu_stop(data, false); }
/* * Move task in a runnable state to another CPU. * - * Tailored on 'active_load_balance_stop_cpu' with slight + * Tailored on 'active_load_balance_cpu_stop' with slight * modification to locking and pre-transfer checks. Note * rq->lock must be held before calling. */ @@ -7285,7 +7177,7 @@ static unsigned int hmp_idle_pull(int this_cpu)
if (force) { stop_one_cpu_nowait(cpu_of(target), - hmp_idle_pull_cpu_stop, + hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); } done:
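In outline, the sharing above amounts to the following skeleton (a sketch only, with the shared body elided; the names mirror the patch but the bodies are not the kernel code):

#include <stdbool.h>

static int __do_active_balance_stop(void *data, bool check_sd_lb_flag)
{
	/* shared lock/validate/move logic elided; the flag gates the
	 * SD_LOAD_BALANCE test and the task-still-here / put_task_struct paths */
	(void)data;
	(void)check_sd_lb_flag;
	return 0;
}

/* CFS path (plays active_load_balance_cpu_stop): keep the sd flag check */
static int cfs_active_balance_stop(void *data)
{
	return __do_active_balance_stop(data, true);
}

/* HMP path (plays hmp_active_task_migration_cpu_stop): skip the flag check */
static int hmp_active_balance_stop(void *data)
{
	return __do_active_balance_stop(data, false);
}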
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/core.c | 18 +++++++++- kernel/sched/fair.c | 95 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 1 + 3 files changed, 102 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de9d360..89e49e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1407,7 +1407,11 @@ void scheduler_ipi(void) { if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + && !got_nohz_idle_kick() +#ifdef CONFIG_SCHED_HMP + && !this_rq()->wake_for_idle_pull +#endif + ) return;
/* @@ -1430,10 +1434,22 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ +#ifdef CONFIG_SCHED_HMP + { + bool nohz = got_nohz_idle_kick(); + if (unlikely(this_rq()->wake_for_idle_pull) || + unlikely(nohz)) { + if (unlikely(nohz)) + this_rq()->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } + } +#else if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#endif irq_exit(); }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e3686b..df849c6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -39,6 +39,9 @@ */ #include <linux/cpufreq.h> #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#ifdef CONFIG_SCHED_HMP +#include <linux/cpuidle.h> +#endif
#include "sched.h"
@@ -3541,6 +3544,61 @@ static const int hmp_max_tasks = 5;
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+struct hmp_keepalive { + bool init; + ktime_t delay; + struct hrtimer timer; +}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); + +/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +static void hmp_keepalive_delay(unsigned int *delay) +{ + struct cpuidle_driver *drv; + drv = cpuidle_driver_ref(); + if (drv) { + int least=0, next=0, idx; + for (idx = 0; idx < drv->state_count; idx++) { + if (drv->states[idx].target_residency < + drv->states[least].target_residency) { + least = idx; + next = idx; + } else if (drv->states[idx].target_residency < + drv->states[next].target_residency) { + next = idx; + } + } + *delay = drv->states[next].target_residency; + if (*delay) + *delay = *delay - 1; + printk("keepalive: use duration %u for cpu%d\n", *delay, smp_processor_id()); + } + cpuidle_driver_unref(); +} + +static void hmp_cpu_keepalive_trigger(void) +{ + int cpu = smp_processor_id(); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (!keepalive->init) { + unsigned int ns_delay = 1000000; + + hrtimer_init(&keepalive->timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + keepalive->timer.function = hmp_cpu_keepalive_notify; + hmp_keepalive_delay(&ns_delay); + keepalive->delay = ns_to_ktime(ns_delay); + keepalive->init = true; + } + hrtimer_start(&keepalive->timer, + keepalive->delay, HRTIMER_MODE_REL_PINNED); +} + /* Setup hmp_domains */ static int __init hmp_cpu_mask_setup(void) { @@ -3598,9 +3656,13 @@ static void hmp_online_cpu(int cpu) static void hmp_offline_cpu(int cpu) { struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
if(domain) cpumask_clear_cpu(cpu, &domain->cpus); + + if (keepalive->init) + hrtimer_cancel(&keepalive->timer); } /* * Needed to determine heaviest tasks etc. @@ -7022,7 +7084,7 @@ static void hmp_force_up_migration(int this_cpu) target = cpu_rq(cpu); raw_spin_lock_irqsave(&target->lock, flags); curr = target->cfs.curr; - if (!curr) { + if (!curr || target->active_balance) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } @@ -7039,16 +7101,13 @@ static void hmp_force_up_migration(int this_cpu) curr = hmp_get_heaviest_task(curr, 1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { - if (!target->active_balance) { - get_task_struct(p); - target->push_cpu = target_cpu; - target->migrate_task = p; - got_target = 1; - trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE); - hmp_next_up_delay(&p->se, target->push_cpu); - } + cpu_rq(target_cpu)->wake_for_idle_pull = 1; + raw_spin_unlock_irqrestore(&target->lock, flags); + spin_unlock(&hmp_force_migration); + smp_send_reschedule(target_cpu); + return; } - if (!got_target && !target->active_balance) { + if (!got_target) { /* * For now we just check the currently running task. * Selecting the lightest task for offloading will @@ -7070,7 +7129,7 @@ static void hmp_force_up_migration(int this_cpu) * is not currently running move it, otherwise let the * CPU stopper take care of it. */ - if (got_target && !target->active_balance) { + if (got_target) { if (!task_running(target, p)) { trace_sched_hmp_migrate_force_running(p, 0); hmp_migrate_runnable_task(target); @@ -7176,6 +7235,8 @@ static unsigned int hmp_idle_pull(int this_cpu) raw_spin_unlock_irqrestore(&target->lock, flags);
if (force) { + /* start timer to keep us awake */ + hmp_cpu_keepalive_trigger(); stop_one_cpu_nowait(cpu_of(target), hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); @@ -7199,6 +7260,18 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE;
+#ifdef CONFIG_SCHED_HMP + /* shortcut for hmp idle pull wakeups */ + if (unlikely(this_rq->wake_for_idle_pull)) { + this_rq->wake_for_idle_pull = 0; + if (hmp_idle_pull(this_cpu)) { + /* break out unless running nohz idle as well */ + if (idle != CPU_IDLE) + return; + } + } +#endif + hmp_force_up_migration(this_cpu);
rebalance_domains(this_cpu, idle); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 989c5ae..0d19ede 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -466,6 +466,7 @@ struct rq { struct cpu_stop_work active_balance_work; #ifdef CONFIG_SCHED_HMP struct task_struct *migrate_task; + int wake_for_idle_pull; #endif /* cpu of this runqueue: */ int cpu;
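Reduced to a toy model, the handshake the patch above introduces looks like this (plain C, not kernel code; the IPI, softirq and per-cpu runqueue are faked with ordinary calls and one struct):

#include <stdio.h>
#include <stdbool.h>

struct model_rq { bool wake_for_idle_pull; };
static struct model_rq big_cpu_rq;

static void big_cpu_softirq(void)
{
	if (big_cpu_rq.wake_for_idle_pull) {
		big_cpu_rq.wake_for_idle_pull = false;
		/* real code: hmp_idle_pull() pulls the heavy task and arms the
		 * pinned keepalive hrtimer so the CPU does not drop back into
		 * deep idle before the stopper has moved the task */
		printf("big cpu: pulling task, keepalive armed\n");
	}
}

static void big_cpu_ipi(void)
{
	/* real code: scheduler_ipi() raises SCHED_SOFTIRQ when the flag is set */
	big_cpu_softirq();
}

int main(void)
{
	/* little cpu: instead of stopping the heavy task and waiting for the
	 * big cpu to wake, mark the target rq and poke it */
	big_cpu_rq.wake_for_idle_pull = true;
	big_cpu_ipi();		/* stands in for smp_send_reschedule(target_cpu) */
	return 0;
}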
When looking for a task to be idle-pulled, don't consider tasks where the affinity does not allow that task to be placed on the target CPU. Also ensure that tasks with restricted affinity do not block selecting other unrestricted busy tasks.
Use the knowledge of target CPU more effectively in idle pull by passing to hmp_get_heaviest_task when we know it, otherwise only checking for general affinity matches with any of the CPUs in the bigger HMP domain.
We still need to explicitly check affinity is allowed in idle pull since if we find no match in hmp_get_heaviest_task we will return the current one, which may not be affine to the new CPU despite having high enough load. In this case, there is nothing to move.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df849c6..da45671 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3674,30 +3674,30 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu);
/* must hold runqueue lock for queue se is currently on */ static struct sched_entity *hmp_get_heaviest_task( - struct sched_entity *se, int migrate_up) + struct sched_entity *se, int target_cpu) { int num_tasks = hmp_max_tasks; struct sched_entity *max_se = se; unsigned long int max_ratio = se->avg.load_avg_ratio; const struct cpumask *hmp_target_mask = NULL; + struct hmp_domain *hmp;
- if (migrate_up) { - struct hmp_domain *hmp; - if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) - return max_se; + if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) + return max_se; + + hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus;
- hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); - hmp_target_mask = &hmp->cpus; - } /* The currently running task is not on the runqueue */ se = __pick_first_entity(cfs_rq_of(se));
while (num_tasks && se) { if (entity_is_task(se) && - (se->avg.load_avg_ratio > max_ratio && - hmp_target_mask && - cpumask_intersects(hmp_target_mask, - tsk_cpus_allowed(task_of(se))))) { + se->avg.load_avg_ratio > max_ratio && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se))) && + target_cpu > -1 ? cpumask_test_cpu(target_cpu, + tsk_cpus_allowed(task_of(se))) : 1) { max_se = se; max_ratio = se->avg.load_avg_ratio; } @@ -7098,7 +7098,7 @@ static void hmp_force_up_migration(int this_cpu) } } orig = curr; - curr = hmp_get_heaviest_task(curr, 1); + curr = hmp_get_heaviest_task(curr, -1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { cpu_rq(target_cpu)->wake_for_idle_pull = 1; @@ -7195,12 +7195,14 @@ static unsigned int hmp_idle_pull(int this_cpu) } } orig = curr; - curr = hmp_get_heaviest_task(curr, 1); + curr = hmp_get_heaviest_task(curr, this_cpu); /* check if heaviest eligible task on this * CPU is heavier than previous task */ if (hmp_task_eligible_for_up_migration(curr) && - curr->avg.load_avg_ratio > ratio) { + curr->avg.load_avg_ratio > ratio && + cpumask_test_cpu(this_cpu, + tsk_cpus_allowed(task_of(curr)))) { p = task_of(curr); target = rq; ratio = curr->avg.load_avg_ratio;
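The affinity rule added by the idle-pull patch above can be modelled on its own (standalone sketch; cpumasks are plain bitmasks and the CPU numbering is invented):

#include <stdio.h>

/* candidate must intersect the faster domain, and, when a concrete target
 * CPU is known, must also be allowed to run there */
static int may_pull(unsigned long allowed, unsigned long faster_domain, int target_cpu)
{
	if (!(allowed & faster_domain))
		return 0;			/* no big CPU allowed at all */
	if (target_cpu > -1 && !(allowed & (1UL << target_cpu)))
		return 0;			/* pinned away from this target */
	return 1;
}

int main(void)
{
	unsigned long big_cpus = 0x3;		/* cpus 0-1 are "big" in this model */
	printf("%d\n", may_pull(0x2, big_cpus, 1));	/* 1: allowed on target cpu1 */
	printf("%d\n", may_pull(0x2, big_cpus, 0));	/* 0: pinned away from cpu0 */
	printf("%d\n", may_pull(0x4, big_cpus, -1));	/* 0: cannot use any big cpu */
	return 0;
}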
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da45671..431c3c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3557,26 +3557,31 @@ static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) return HRTIMER_NORESTART; }
-static void hmp_keepalive_delay(unsigned int *delay) +static void hmp_keepalive_delay(unsigned int *ns_delay) { struct cpuidle_driver *drv; drv = cpuidle_driver_ref(); if (drv) { - int least=0, next=0, idx; + unsigned int us_next = UINT_MAX; + unsigned int us_least = UINT_MAX; + unsigned int ns_next; + int idx; for (idx = 0; idx < drv->state_count; idx++) { - if (drv->states[idx].target_residency < - drv->states[least].target_residency) { - least = idx; - next = idx; - } else if (drv->states[idx].target_residency < - drv->states[next].target_residency) { - next = idx; + if (drv->states[idx].target_residency < us_least) { + us_least = drv->states[idx].target_residency; } } - *delay = drv->states[next].target_residency; - if (*delay) - *delay = *delay - 1; - printk("keepalive: use duration %u for cpu%d\n", *delay, smp_processor_id()); + for (idx = 0; idx < drv->state_count; idx++) { + if (drv->states[idx].target_residency > us_least && + drv->states[idx].target_residency < us_next) { + us_next = drv->states[idx].target_residency; + } + } + if (us_next) + us_next>>=1; + ns_next = us_next << 10; + *ns_delay = *ns_delay < ns_next ? *ns_delay : ns_next; + printk("keepalive: use duration %u for cpu%d\n", *ns_delay, smp_processor_id()); } cpuidle_driver_unref(); }
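One way to read the rewritten delay selection above, as a standalone sketch (the idle-state residency table is invented, the <<10 reproduces the patch's approximate microsecond-to-nanosecond conversion, and the explicit UINT_MAX guard is an addition of the sketch, not in the patch):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned int residency_us[] = { 1, 500, 2700 };	/* invented idle states */
	unsigned int us_least = UINT_MAX, us_next = UINT_MAX;
	unsigned int ns_delay = 1000000;		/* 1 ms default */
	int i, n = 3;

	/* shallowest state's target residency */
	for (i = 0; i < n; i++)
		if (residency_us[i] < us_least)
			us_least = residency_us[i];
	/* next larger residency, i.e. the second-shallowest state */
	for (i = 0; i < n; i++)
		if (residency_us[i] > us_least && residency_us[i] < us_next)
			us_next = residency_us[i];

	if (us_next != UINT_MAX) {
		unsigned int ns_next = (us_next >> 1) << 10;	/* halve, ~us to ns */
		if (ns_next < ns_delay)
			ns_delay = ns_next;
	}
	printf("keepalive delay: %u ns\n", ns_delay);	/* 256000 with this table */
	return 0;
}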
If we trigger for a nohz kick, it doesn't matter that we skip triggering for an idle pull, since the idle_pull flag will remain set when we execute the softirq. It's simpler to lay out the code this way.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/core.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e49e1..2a74474 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1434,22 +1434,16 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ -#ifdef CONFIG_SCHED_HMP - { - bool nohz = got_nohz_idle_kick(); - if (unlikely(this_rq()->wake_for_idle_pull) || - unlikely(nohz)) { - if (unlikely(nohz)) - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - } -#else if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#ifdef CONFIG_SCHED_HMP + else if (unlikely(this_rq()->wake_for_idle_pull)) { + raise_softirq_irqoff(SCHED_SOFTIRQ); + } #endif + irq_exit(); }
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
In anticipation of modifying the up_threshold handling, make all instances use the same utility fn to check if a task is eligible for up-migration.
Signed-off-by: Chris Redpath chris.redpath@arm.com
kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a8808..1b9be59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6734,6 +6734,13 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif #ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{
+	/* below hmp_up_threshold, never eligible */
+	if (se->avg.load_avg_ratio < hmp_up_threshold)
+		return 0;
+	return 1;
+}
Guess there should be a blank line added here as well? (I can do that before adding this patch to the MP topic branch), but I have another comment below about a possible bug...
/* Check if task should migrate to a faster cpu */ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { @@ -6749,7 +6756,7 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_enti if (p->prio >= hmp_up_prio) return 0; #endif
- if (se->avg.load_avg_ratio < hmp_up_threshold)
+	if (!hmp_task_eligible_for_up_migration(se))
 		return 0;
/* Let the task load settle before doing another up migration */ @@ -7237,7 +7244,10 @@ static unsigned int hmp_idle_pull(int this_cpu) } orig = curr; curr = hmp_get_heaviest_task(curr, 1);
-	if (curr->avg.load_avg_ratio > hmp_up_threshold &&
+	/* check if heaviest eligible task on this
+	 * CPU is heavier than previous task
+	 */
+	if (hmp_task_eligible_for_up_migration(curr) &&
This isn't strictly a like-for-like replacement, because you've effectively replaced:
curr->avg.load_avg_ratio > hmp_up_threshold
with !(se->avg.load_avg_ratio < hmp_up_threshold)
which gives different results when the values compared are equal. I don't know if that makes much practical difference? In fact, the new use possibly makes more sense, as before there was a situation where hmp_force_up_migration could make different decisions to hmp_idle_pull. (I'm not particularly familiar with the code so I may not be talking sense.)
curr->avg.load_avg_ratio > ratio) { p = task_of(curr); target = rq;
Thanks Tixy, I really appreciate you looking at these.
On 14/03/14 09:58, Jon Medhurst (Tixy) wrote:
On Thu, 2014-03-13 at 17:19 +0000, Chris Redpath wrote:
In anticipation of modifying the up_threshold handling, make all instances use the same utility fn to check if a task is eligible for up-migration.
Signed-off-by: Chris Redpath chris.redpath@arm.com
kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a8808..1b9be59 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6734,6 +6734,13 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif
#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{
+	/* below hmp_up_threshold, never eligible */
+	if (se->avg.load_avg_ratio < hmp_up_threshold)
+		return 0;
+	return 1;
+}
Guess there should be a blank line added here as well? (I can do that before adding this patch to the MP topic branch), but I have another comment below about a possible bug...
Yes, please add a line there.
/* Check if task should migrate to a faster cpu */ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { @@ -6749,7 +6756,7 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_enti if (p->prio >= hmp_up_prio) return 0; #endif
- if (se->avg.load_avg_ratio < hmp_up_threshold)
+	if (!hmp_task_eligible_for_up_migration(se))
 		return 0;
/* Let the task load settle before doing another up migration */
@@ -7237,7 +7244,10 @@ static unsigned int hmp_idle_pull(int this_cpu) } orig = curr; curr = hmp_get_heaviest_task(curr, 1);
-	if (curr->avg.load_avg_ratio > hmp_up_threshold &&
+	/* check if heaviest eligible task on this
+	 * CPU is heavier than previous task
+	 */
+	if (hmp_task_eligible_for_up_migration(curr) &&
This isn't strictly a like-for-like replacement, because you've effectively replaced:
curr->avg.load_avg_ratio > hmp_up_threshold
with !(se->avg.load_avg_ratio < hmp_up_threshold)
which gives different results when the values compared are equal. I don't know if that makes much practical difference? In fact, the new use possibly makes more sense, as before there was a situation where hmp_force_up_migration could make different decisions to hmp_idle_pull. (I'm not particularly familiar with the code so I may not be talking sense.)
Well spotted :) This is intentional, and your comment makes perfect sense. Putting the decision in a fn means I get the opportunity to express the logical intent in the name, which I like to do. As you spotted, the existing code behaved in both ways - up migration was aborted if the load was < threshold and idle pull only attempted if load was > threshold. There is clearly a difference in what load == threshold means here.
In all the documentation we imply that a task should be eligible for up-migration when we reach the threshold rather than pass it, and given that up-migration is the primary cause of big CPU use I thought that was the right model to choose.
In practice, I don't expect any difference at all :) If an idle pull was missed because the load was only == threshold, it would likely have been up-migrated at or around the next scheduler tick anyway.
curr->avg.load_avg_ratio > ratio) { p = task_of(curr); target = rq;
On Fri, 2014-03-14 at 11:21 +0000, Chris Redpath wrote:
On 14/03/14 09:58, Jon Medhurst (Tixy) wrote:
[...]
This isn't strictly a like-for-like replacement, because you've effectively replaced:
curr->avg.load_avg_ratio > hmp_up_threshold
with !(se->avg.load_avg_ratio < hmp_up_threshold)
which gives different results when the values compared are equal. I don't know if that makes much practical difference? In fact, the new use possibly makes more sense, as before there was a situation where hmp_force_up_migration could make different decisions to hmp_idle_pull. (I'm not particularly familiar with the code so I may not be talking sense.)
Well spotted :) This is intentional, and your comment makes perfect sense.
In which case it might be good to have the change explained in the commit message. Care to come up with some wording? I've not finished looking at the other patches, and we've not had any feedback from Alex Shi about the patches yet, so I suggest waiting first to see if the patches need updating and resending.
Cheers
Hi Tixy,
Another respin.
[PATCH v4 1/4] hmp: sched: Clean up hmp_up_threshold checks into a
[PATCH v4 2/4] sched: hmp: unify active migration code
[PATCH v4 3/4] hmp: Use idle pull to perform forced up-migrations
[PATCH v4 4/4] hmp: dont attempt to pull tasks if affinity doesn't