In anticipation of modifying the up_threshold handling, make all instances use the same utility fn to check if a task is eligible for up-migration. This also removes the previous difference in threshold comparison where up-migration used '!<threshold' and idle pull used '>threshold' to decide up-migration eligibility. Make them both use '!<threshold' instead for consistency, although this is unlikely to change any results.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a8808..29e2c74 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6734,6 +6734,14 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif
#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{ + /* below hmp_up_threshold, never eligible */ + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + return 1; +} + /* Check if task should migrate to a faster cpu */ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { @@ -6749,7 +6757,7 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_enti if (p->prio >= hmp_up_prio) return 0; #endif - if (se->avg.load_avg_ratio < hmp_up_threshold) + if (!hmp_task_eligible_for_up_migration(se)) return 0;
/* Let the task load settle before doing another up migration */ @@ -7237,7 +7245,10 @@ static unsigned int hmp_idle_pull(int this_cpu) } orig = curr; curr = hmp_get_heaviest_task(curr, 1); - if (curr->avg.load_avg_ratio > hmp_up_threshold && + /* check if heaviest eligible task on this + * CPU is heavier than previous task + */ + if (hmp_task_eligible_for_up_migration(curr) && curr->avg.load_avg_ratio > ratio) { p = task_of(curr); target = rq;
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 188 +++++++++++---------------------------------------- 1 file changed, 40 insertions(+), 148 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 29e2c74..4e3686b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6214,6 +6214,7 @@ out: #ifdef CONFIG_SCHED_HMP static unsigned int hmp_idle_pull(int this_cpu); #endif +static int move_specific_task(struct lb_env *env, struct task_struct *pm); /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -6273,21 +6274,17 @@ void idle_balance(int this_cpu, struct rq *this_rq) } }
-/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) +static int __do_active_load_balance_cpu_stop(void *data, bool check_sd_lb_flag) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p;
raw_spin_lock_irq(&busiest_rq->lock); + p = busiest_rq->migrate_task;
/* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -6298,6 +6295,11 @@ static int active_load_balance_cpu_stop(void *data) if (busiest_rq->nr_running <= 1) goto out_unlock;
+ if (!check_sd_lb_flag) { + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + } /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by @@ -6311,12 +6313,14 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + if (((check_sd_lb_flag && sd->flags & SD_LOAD_BALANCE) || + !check_sd_lb_flag) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; }
if (likely(sd)) { + bool success = false; struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, @@ -6328,7 +6332,14 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env)) + if (check_sd_lb_flag) { + if (move_one_task(&env)) + success = true; + } else { + if (move_specific_task(&env, p)) + success = true; + } + if (success) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -6336,11 +6347,24 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: + if (!check_sd_lb_flag) + put_task_struct(p); busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); return 0; }
+/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, true); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6901,151 +6925,19 @@ static int move_specific_task(struct lb_env *env, struct task_struct *pm) * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to * migrate a specific task from one runqueue to another. * hmp_force_up_migration uses this to push a currently running task - * off a runqueue. - * Based on active_load_balance_stop_cpu and can potentially be merged. + * off a runqueue. hmp_idle_pull uses this to pull a currently + * running task to an idle runqueue. + * Reuses __do_active_load_balance_cpu_stop to actually do the work. */ static int hmp_active_task_migration_cpu_stop(void *data) { - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) { - goto out_unlock; - } - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; -} - -/* - * hmp_idle_pull_cpu_stop is run by cpu stopper and used to - * migrate a specific task from one runqueue to another. - * hmp_idle_pull uses this to push a currently running task - * off a runqueue to a faster CPU. - * Locking is slightly different than usual. - * Based on active_load_balance_stop_cpu and can potentially be merged. 
- */ -static int hmp_idle_pull_cpu_stop(void *data) -{ - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) - goto out_unlock; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; + return __do_active_load_balance_cpu_stop(data, false); }
/* * Move task in a runnable state to another CPU. * - * Tailored on 'active_load_balance_stop_cpu' with slight + * Tailored on 'active_load_balance_cpu_stop' with slight * modification to locking and pre-transfer checks. Note * rq->lock must be held before calling. */ @@ -7285,7 +7177,7 @@ static unsigned int hmp_idle_pull(int this_cpu)
if (force) { stop_one_cpu_nowait(cpu_of(target), - hmp_idle_pull_cpu_stop, + hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); } done:
On Tue, 2014-03-18 at 15:32 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
This gets build errors if SCHED_HMP is not configured.
kernel/sched/fair.c: In function ‘__do_active_load_balance_cpu_stop’: kernel/sched/fair.c:6399:16: error: ‘struct rq’ has no member named ‘migrate_task’ p = busiest_rq->migrate_task; ^
Unfortunately, of the two functions you have combined, one seems needed for HMP and one not, and the new common function references HMP-specific things.
On Tue, 2014-03-18 at 16:55 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 15:32 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
This gets build errors if SCHED_HMP is not configured.
The changes below fix the build errors. Chris, are you OK for me to squash that into your patch?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e3686b..3431533 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6211,10 +6211,17 @@ out_one_pinned: out: return ld_moved; } + #ifdef CONFIG_SCHED_HMP static unsigned int hmp_idle_pull(int this_cpu); -#endif static int move_specific_task(struct lb_env *env, struct task_struct *pm); +#else +inline int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + return 0; +} +#endif + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -6281,11 +6288,12 @@ static int __do_active_load_balance_cpu_stop(void *data, bool check_sd_lb_flag) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - struct task_struct *p; + struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock); +#ifdef CONFIG_SCHED_HMP p = busiest_rq->migrate_task; - +#endif /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance))
On Wed, 2014-03-19 at 10:00 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 16:55 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 15:32 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
This gets build errors if SCHED_HMP is not configured.
The changes below fix the build errors. Chris, are you OK for me to squash that into your patch?
As I've not heard back, I'll keep it as a separate patch. I'm also going to assume that ARM still want these changes in the coming LSK release, so I'll put together a pull request for the LSK team. It's a bit late in the monthly release cycle now, so if they get accepted, they aren't going to get quite as much testing as they would have otherwise.
On Wed, 2014-03-19 at 16:13 +0000, Jon Medhurst (Tixy) wrote:
On Wed, 2014-03-19 at 10:00 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 16:55 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 15:32 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
This gets build errors if SCHED_HMP is not configured.
The changes below fix the build errors. Chris, are you OK for me to squash that into your patch?
As I've not heard back, I'll keep it as a separate patch. I'm also going to assume that ARM still want these changes in the coming LSK release, so I'll put together a pull request for the LSK team.
Actually, I just noticed that version 3 of the patches was titled "13.04 HMP Patches V3". So I'm now wondering if these were expected to wait until the 13.04 release, or if that was just a typo.
So I've now changed my mind and won't send a pull request to LSK. Perhaps someone will enlighten me sometime as to what's required...
Hi Tixy,
Sorry that should have been a typo. Chris is not in today, hence not getting a response on this.
Don't we have time till tomorrow to sort out any fixes to the delivered patches from codefreeze date?
Can we wait till tomorrow to hear from Chris?
Thanks Basil Eljuse...
-----Original Message----- From: Jon Medhurst (Tixy) [mailto:tixy@linaro.org] Sent: 19 March 2014 16:34 To: Chris Redpath Cc: mark.brown@linaro.org; Robin Randhawa; Basil Eljuse; linaro-kernel@lists.linaro.org; alex.shi@linaro.org Subject: Re: [PATCH v4 2/4] sched: hmp: unify active migration code
On Wed, 2014-03-19 at 16:13 +0000, Jon Medhurst (Tixy) wrote:
On Wed, 2014-03-19 at 10:00 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 16:55 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 15:32 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
This gets build errors if SCHED_HMP is not configured.
The changes below fix the build errors. Chris, are you OK for me to squash that into your patch?
As I've not heard back, I'll keep it as a separate patch. I'm also going to assume that ARM still want these changes in the coming LSK release, so I'll put together a pull request for the LSK team.
Actually, I just noticed that version 3 of the patches was titled "13.04 HMP Patches V3". So I'm now wondering if these were expected to wait until the 13.04 release, or if that was just a typo.
So I've now changed my mind and won't send a pull request to LSK. Perhaps someone will enlighten me sometime as to what's required...
-- Tixy
On 19/03/14 16:13, Jon Medhurst (Tixy) wrote:
On Wed, 2014-03-19 at 10:00 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 16:55 +0000, Jon Medhurst (Tixy) wrote:
On Tue, 2014-03-18 at 15:32 +0000, Chris Redpath wrote:
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Signed-off-by: Chris Redpath chris.redpath@arm.com
This gets build errors if SCHED_HMP is not configured.
The changes below fix the build errors. Chris, are you OK for me to squash that into your patch?
As I've not heard back, I'll keep it as a separate patch. I'm also going to assume that ARM still want these changes in the coming LSK release, so I'll put together a pull request for the LSK team. It's a bit late in the monthly release cycle now, so if they get accepted, they aren't going to get quite as much testing as they would have otherwise.
Hi Tixy, I'm a bit out of date with the comments so far. Let me roll up everything I can find into a new set for your review and we can go from there.
--Chris
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2. Removed implicit dependency on CPU_IDLE from patch 3.
--Chris
In anticipation of modifying the up_threshold handling, make all instances use the same utility fn to check if a task is eligible for up-migration. This also removes the previous difference in threshold comparison where up-migration used '!<threshold' and idle pull used '>threshold' to decide up-migration eligibility. Make them both use '!<threshold' instead for consistency, although this is unlikely to change any results.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a8808..29e2c74 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6734,6 +6734,14 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } #endif
#ifdef CONFIG_SCHED_HMP +static unsigned int hmp_task_eligible_for_up_migration(struct sched_entity *se) +{ + /* below hmp_up_threshold, never eligible */ + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + return 1; +} + /* Check if task should migrate to a faster cpu */ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) { @@ -6749,7 +6757,7 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_enti if (p->prio >= hmp_up_prio) return 0; #endif - if (se->avg.load_avg_ratio < hmp_up_threshold) + if (!hmp_task_eligible_for_up_migration(se)) return 0;
/* Let the task load settle before doing another up migration */ @@ -7237,7 +7245,10 @@ static unsigned int hmp_idle_pull(int this_cpu) } orig = curr; curr = hmp_get_heaviest_task(curr, 1); - if (curr->avg.load_avg_ratio > hmp_up_threshold && + /* check if heaviest eligible task on this + * CPU is heavier than previous task + */ + if (hmp_task_eligible_for_up_migration(curr) && curr->avg.load_avg_ratio > ratio) { p = task_of(curr); target = rq;
The HMP active migration code is functionally identical to the CFS active migration code apart from one flag check. Share the code and make the flag check optional.
Two wrapper functions allow the flag check to be present or not.
Thanks to tixy@linaro.org for pointing out the build break and a good solution in an earlier version.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 198 +++++++++++++-------------------------------------- 1 file changed, 49 insertions(+), 149 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 29e2c74..af82120 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6211,9 +6211,17 @@ out_one_pinned: out: return ld_moved; } + #ifdef CONFIG_SCHED_HMP static unsigned int hmp_idle_pull(int this_cpu); +static int move_specific_task(struct lb_env *env, struct task_struct *pm); +#else +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + return 0; +} #endif + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -6273,22 +6281,19 @@ void idle_balance(int this_cpu, struct rq *this_rq) } }
-/* - * active_load_balance_cpu_stop is run by cpu stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) +static int __do_active_load_balance_cpu_stop(void *data, bool check_sd_lb_flag) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock); - +#ifdef CONFIG_SCHED_HMP + p = busiest_rq->migrate_task; +#endif /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) @@ -6298,6 +6303,11 @@ static int active_load_balance_cpu_stop(void *data) if (busiest_rq->nr_running <= 1) goto out_unlock;
+ if (!check_sd_lb_flag) { + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + } /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by @@ -6311,12 +6321,14 @@ static int active_load_balance_cpu_stop(void *data) /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + if (((check_sd_lb_flag && sd->flags & SD_LOAD_BALANCE) || + !check_sd_lb_flag) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; }
if (likely(sd)) { + bool success = false; struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, @@ -6328,7 +6340,14 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
- if (move_one_task(&env)) + if (check_sd_lb_flag) { + if (move_one_task(&env)) + success = true; + } else { + if (move_specific_task(&env, p)) + success = true; + } + if (success) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -6336,11 +6355,24 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: + if (!check_sd_lb_flag) + put_task_struct(p); busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); return 0; }
+/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + return __do_active_load_balance_cpu_stop(data, true); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6901,151 +6933,19 @@ static int move_specific_task(struct lb_env *env, struct task_struct *pm) * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to * migrate a specific task from one runqueue to another. * hmp_force_up_migration uses this to push a currently running task - * off a runqueue. - * Based on active_load_balance_stop_cpu and can potentially be merged. + * off a runqueue. hmp_idle_pull uses this to pull a currently + * running task to an idle runqueue. + * Reuses __do_active_load_balance_cpu_stop to actually do the work. */ static int hmp_active_task_migration_cpu_stop(void *data) { - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) { - goto out_unlock; - } - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; -} - -/* - * hmp_idle_pull_cpu_stop is run by cpu stopper and used to - * migrate a specific task from one runqueue to another. - * hmp_idle_pull uses this to push a currently running task - * off a runqueue to a faster CPU. - * Locking is slightly different than usual. - * Based on active_load_balance_stop_cpu and can potentially be merged. 
- */ -static int hmp_idle_pull_cpu_stop(void *data) -{ - struct rq *busiest_rq = data; - struct task_struct *p = busiest_rq->migrate_task; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - - raw_spin_lock_irq(&busiest_rq->lock); - - /* make sure the requested cpu hasn't gone down in the meantime */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) - goto out_unlock; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - - /* Task has migrated meanwhile, abort forced migration */ - if (task_rq(p) != busiest_rq) - goto out_unlock; - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - - schedstat_inc(sd, alb_count); - - if (move_specific_task(&env, p)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); - } - rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); -out_unlock: - put_task_struct(p); - busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); - return 0; + return __do_active_load_balance_cpu_stop(data, false); }
/* * Move task in a runnable state to another CPU. * - * Tailored on 'active_load_balance_stop_cpu' with slight + * Tailored on 'active_load_balance_cpu_stop' with slight * modification to locking and pre-transfer checks. Note * rq->lock must be held before calling. */ @@ -7285,7 +7185,7 @@ static unsigned int hmp_idle_pull(int this_cpu)
if (force) { stop_one_cpu_nowait(cpu_of(target), - hmp_idle_pull_cpu_stop, + hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); } done:
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If we trigger for a nohz kick, it doesn't matter about triggering for an idle pull since the idle_pull flag will be set when we execute the softirq and we'll still do the idle pull.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/core.c | 11 +++- kernel/sched/fair.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 1 + 3 files changed, 142 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de9d360..f583951 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1407,7 +1407,11 @@ void scheduler_ipi(void) { if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + && !got_nohz_idle_kick() +#ifdef CONFIG_SCHED_HMP + && !this_rq()->wake_for_idle_pull +#endif + ) return;
/* @@ -1434,6 +1438,11 @@ void scheduler_ipi(void) this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#ifdef CONFIG_SCHED_HMP + else if (unlikely(this_rq()->wake_for_idle_pull)) + raise_softirq_irqoff(SCHED_SOFTIRQ); +#endif + irq_exit(); }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index af82120..1929e32 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -39,6 +39,9 @@ */ #include <linux/cpufreq.h> #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#ifdef CONFIG_SCHED_HMP +#include <linux/cpuidle.h> +#endif
#include "sched.h"
@@ -3541,6 +3544,110 @@ static const int hmp_max_tasks = 5;
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+#ifdef CONFIG_CPU_IDLE +/* + * hmp_idle_pull: + * + * In this version we have stopped using forced up migrations when we + * detect that a task running on a little CPU should be moved to a bigger + * CPU. In most cases, the bigger CPU is in a deep sleep state and a forced + * migration means we stop the task immediately but need to wait for the + * target CPU to wake up before we can restart the task which is being + * moved. Instead, we now wake a big CPU with an IPI and ask it to pull + * a task when ready. This allows the task to continue executing on its + * current CPU, reducing the amount of time that the task is stalled for. + * + * keepalive timers: + * + * The keepalive timer is used as a way to keep a CPU engaged in an + * idle pull operation out of idle while waiting for the source + * CPU to stop and move the task. Ideally this would not be necessary + * and we could impose a temporary zero-latency requirement on the + * current CPU, but in the current QoS framework this will result in + * all CPUs in the system being unable to enter idle states which is + * not desirable. The timer does not perform any work when it expires. + */ +struct hmp_keepalive { + bool init; + ktime_t delay; /* if zero, no need for timer */ + struct hrtimer timer; +}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); + +/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +/* + * Work out if any of the idle states have an exit latency too high for us. + * ns_delay is passed in containing the max we are willing to tolerate. + * If there are none, set ns_delay to zero. + * If there are any, set ns_delay to + * ('target_residency of state with shortest too-big latency' - 1) * 1000. + */ +static void hmp_keepalive_delay(unsigned int *ns_delay) +{ + struct cpuidle_driver *drv; + drv = cpuidle_driver_ref(); + if (drv) { + unsigned int us_delay = UINT_MAX; + unsigned int us_max_delay = *ns_delay / 1000; + int idx; + /* if cpuidle states are guaranteed to be sorted we + * could stop at the first match. + */ + for (idx = 0; idx < drv->state_count; idx++) { + if (drv->states[idx].exit_latency > us_max_delay && + drv->states[idx].target_residency < us_delay) { + us_delay = drv->states[idx].target_residency; + } + } + if (us_delay == UINT_MAX) + *ns_delay = 0; /* no timer required */ + else + *ns_delay = 1000 * (us_delay - 1); + } + cpuidle_driver_unref(); +} + +static void hmp_cpu_keepalive_trigger(void) +{ + int cpu = smp_processor_id(); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (!keepalive->init) { + unsigned int ns_delay = 100000; /* tolerate 100usec delay */ + + hrtimer_init(&keepalive->timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + keepalive->timer.function = hmp_cpu_keepalive_notify; + + hmp_keepalive_delay(&ns_delay); + keepalive->delay = ns_to_ktime(ns_delay); + keepalive->init = true; + } + if (ktime_to_ns(keepalive->delay)) + hrtimer_start(&keepalive->timer, + keepalive->delay, HRTIMER_MODE_REL_PINNED); +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (keepalive->init) + hrtimer_cancel(&keepalive->timer); +} +#else /* !CONFIG_CPU_IDLE */ +static void hmp_cpu_keepalive_trigger(void) +{ +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ +} +#endif + /* Setup hmp_domains */ static int __init hmp_cpu_mask_setup(void) { @@ -3601,6 +3708,8 @@ static void hmp_offline_cpu(int cpu)
if(domain) cpumask_clear_cpu(cpu, &domain->cpus); + + hmp_cpu_keepalive_cancel(cpu); } /* * Needed to determine heaviest tasks etc. @@ -7030,7 +7139,7 @@ static void hmp_force_up_migration(int this_cpu) target = cpu_rq(cpu); raw_spin_lock_irqsave(&target->lock, flags); curr = target->cfs.curr; - if (!curr) { + if (!curr || target->active_balance) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } @@ -7047,16 +7156,13 @@ static void hmp_force_up_migration(int this_cpu) curr = hmp_get_heaviest_task(curr, 1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { - if (!target->active_balance) { - get_task_struct(p); - target->push_cpu = target_cpu; - target->migrate_task = p; - got_target = 1; - trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE); - hmp_next_up_delay(&p->se, target->push_cpu); - } + cpu_rq(target_cpu)->wake_for_idle_pull = 1; + raw_spin_unlock_irqrestore(&target->lock, flags); + spin_unlock(&hmp_force_migration); + smp_send_reschedule(target_cpu); + return; } - if (!got_target && !target->active_balance) { + if (!got_target) { /* * For now we just check the currently running task. * Selecting the lightest task for offloading will @@ -7078,7 +7184,7 @@ static void hmp_force_up_migration(int this_cpu) * is not currently running move it, otherwise let the * CPU stopper take care of it. */ - if (got_target && !target->active_balance) { + if (got_target) { if (!task_running(target, p)) { trace_sched_hmp_migrate_force_running(p, 0); hmp_migrate_runnable_task(target); @@ -7184,6 +7290,8 @@ static unsigned int hmp_idle_pull(int this_cpu) raw_spin_unlock_irqrestore(&target->lock, flags);
if (force) { + /* start timer to keep us awake */ + hmp_cpu_keepalive_trigger(); stop_one_cpu_nowait(cpu_of(target), hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); @@ -7207,6 +7315,18 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE;
+#ifdef CONFIG_SCHED_HMP + /* shortcut for hmp idle pull wakeups */ + if (unlikely(this_rq->wake_for_idle_pull)) { + this_rq->wake_for_idle_pull = 0; + if (hmp_idle_pull(this_cpu)) { + /* break out unless running nohz idle as well */ + if (idle != CPU_IDLE) + return; + } + } +#endif + hmp_force_up_migration(this_cpu);
rebalance_domains(this_cpu, idle); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 989c5ae..0d19ede 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -466,6 +466,7 @@ struct rq { struct cpu_stop_work active_balance_work; #ifdef CONFIG_SCHED_HMP struct task_struct *migrate_task; + int wake_for_idle_pull; #endif /* cpu of this runqueue: */ int cpu;
On 03/24/2014 09:47 PM, Chris Redpath wrote:
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If we trigger for a nohz kick, it doesn't matter about triggering for an idle pull since the idle_pull flag will be set when we execute the softirq and we'll still do the idle pull.
If the target CPU is busy, we will not pull any tasks.
Chris, I do not fully understand the MP feature. So correct me if I am wrong. :)
The trade-off is one more reschedule interrupt and keeping the big CPU alive, which costs more energy. So do you have data showing the trade-off is worthwhile? For example, the reschedule interrupt cost and CPU-alive cost versus the cost of going to idle and being woken up, or benchmark data showing we get a performance/power benefit.
As to the extra reschedule interrupt, could we check the target's pending timers before adding a new one? If it already has a timer due soon enough, we could save the keepalive interrupt.
BTW, did we check the little CPU domain to see if it is under-utilized? If so, relieving the big CPU's load would help power efficiency. I see the newly-idle pull is only done for the big CPUs, not for the little CPUs.
Hi Alex,
Glad to have you looking at my code as well :)
On 28/03/14 09:14, Alex Shi wrote:
On 03/24/2014 09:47 PM, Chris Redpath wrote:
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If we trigger for a nohz kick, it doesn't matter about triggering for an idle pull since the idle_pull flag will be set when we execute the softirq and we'll still do the idle pull.
If the target CPU is busy, we will not pull any tasks.
Chris, I do not fully understand the MP feature. So correct me if I am wrong. :)
The trade-off is one more reschedule interrupt and keeping the big CPU alive, which costs more energy.
It's not really extra cost. We would have performed the reschedule anyway in order to do the migration except that previously we would have waited in the CPU stopper on the source CPU while the target CPU woke from sleep and now we continue while that happens.
Since we are always waking an idle big CPU when we make this decision, we are typically paying an idle wakeup cost each time. When running the mobile workloads we are mostly interested in, that idle wakeup is frequently a wakeup from cluster shutdown mode which can be over 1ms.
The aim of this change was to try and prevent dropped frames during hmp up-migrations caused by the execution stalling while waiting for the target CPU to become available.
The CPU keepalive is there to prevent entering deep idle states in the couple of hundred microseconds that the CPU stopper takes to run on the source CPU. It could be more logically expressed as a (very) temporary idle latency requirement, except that we cannot express such constraints for a single CPU in the kernel today.
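For illustration only, here is a minimal sketch of what that rejected system-wide alternative might look like using the existing PM QoS interface; the request and helper names are hypothetical and this is not part of the patch. The problem is that such a request constrains idle entry on every CPU, not just the one waiting to pull:

/* sketch: system-wide latency cap while an idle pull is in flight */
#include <linux/pm_qos.h>

static struct pm_qos_request hmp_idle_pull_qos;	/* hypothetical name */

static void hmp_keepalive_qos_begin(void)
{
	/* forbid deep idle everywhere until the pulled task arrives */
	pm_qos_add_request(&hmp_idle_pull_qos, PM_QOS_CPU_DMA_LATENCY, 0);
}

static void hmp_keepalive_qos_end(void)
{
	pm_qos_remove_request(&hmp_idle_pull_qos);
}

Because the constraint cannot be scoped to a single CPU, the patch uses the pinned per-cpu hrtimer instead.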
So do you have data showing the trade-off is worthwhile? For example, the reschedule interrupt cost and CPU-alive cost versus the cost of going to idle and being woken up, or benchmark data showing we get a performance/power benefit.
I have traces which show the resulting improvement but it's so small that it is lost in the noise in all the benchmarks we have. Most of the benchmarks do not actually involve that much migration between clusters - typically the 'benchmark' app tasks start heavy processing and continue until complete, and with the HMP thresholds we use our lighter workloads generally migrate once or twice per operation.
We have loads of data to show that the change has no detrimental impact on any of the metrics for our benchmarked scenarios (power is largely unchanged as is performance) however I need to complete the microbenchmark I've been working on to get some numbers showing what is visible in traces.
As to the extra reschedule interrupt, could we check the target's pending timers before adding a new one? If it already has a timer due soon enough, we could save the keepalive interrupt.
We could do this, but since we are waking an idle CPU I do not expect us to ever have such a pending timer. I have wondered if it is worth canceling the pending timer event when we start executing the new task but I haven't done any measurements in that area.
BTW, did we check the little CPU domain to see if it is under-utilized? If so, relieving the big CPU's load would help power efficiency. I see the newly-idle pull is only done for the big CPUs, not for the little CPUs.
We have a bunch of mechanisms in place to try never to have a situation where the load on a big CPU could be relieved by a little CPU. In the HMP patches, our little CPUs are allowed to run any tasks but the big CPUs are only allowed to run tasks where their tracked (unweighted) load is above our up-threshold. We do this by removing the scheduler's cluster balancing and replacing it at a handful of key points with a load-based decision (both task load and domain loads are used).
When big tasks become runnable, we will use a big CPU if we have an idle one but we will happily use a little CPU if all the big CPUs are in use. While a big task is running, we will move it to a big CPU if there is an idle one, and if a big CPU becomes idle it will check that there are no suitable tasks on a little CPU that could be pulled before it goes idle.
This way, we mostly avoid a situation where the big cluster is overloaded and the little cluster has spare capacity.
If we do get into a situation where we have multiple tasks resident on a big CPU (it can happen if we pull something and the original task wakes up again), we check to see if the overall progress can be better served by offloading one of the heavier tasks to a little CPU.
It is true that in really heavy load situations where you have > num_cpus busy tasks, we could do better by allowing tasks to spread according to the relative compute capacities of the cores, but this is not a situation we have seen in any mobile workload yet. Hopefully energy-aware scheduling will handle this perfectly :)
We do have a difference in the newly-idle behaviour depending upon whether the CPU is big or little. Since we have disabled the cpu-level balance, CPUs do not idle pull across the clusters but only inside them. This covers spreading light load and heavy load independently, and that is the end of the story for little CPUs.
big CPUs can idle pull from other big CPUs (as normal) and, if that doesn't find a task, they have an additional step which finds the heaviest-load task on any little CPU and pulls it if the task load is above the up-threshold.
--Chris
On 03/28/2014 07:25 PM, Chris Redpath wrote:
Hi Alex,
Glad to have you looking at my code as well :)
On 28/03/14 09:14, Alex Shi wrote:
On 03/24/2014 09:47 PM, Chris Redpath wrote:
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If we trigger for a nohz kick, it doesn't matter about triggering for an idle pull since the idle_pull flag will be set when we execute the softirq and we'll still do the idle pull.
If the target CPU is busy, we will not pull any tasks.
Chris, I do not fully understand the MP feature. So correct me if I am wrong. :)
The trade-off is one more reschedule interrupt and keeping the big CPU alive, which costs more energy.
It's not really extra cost. We would have performed the reschedule anyway in order to do the migration except that previously we would have waited in the CPU stopper on the source CPU while the target CPU woke from sleep and now we continue while that happens.
Since we are always waking an idle big CPU when we make this decision, we are typically paying an idle wakeup cost each time. When running the mobile workloads we are mostly interested in, that idle wakeup is frequently a wakeup from cluster shutdown mode which can be over 1ms.
The aim of this change was to try and prevent dropped frames during hmp up-migrations caused by the execution stalling while waiting for the target CPU to become available.
The CPU keepalive is there to prevent entering deep idle states in the couple of hundred microseconds that the CPU stopper takes to run on the source CPU. It could be more logically expressed as a (very) temporary idle latency requirement, except that we cannot express such constraints for a single CPU in the kernel today.
Hi Chris, I really appreciate your detailed explanations! It looks like your patch makes an excellent improvement on this issue.
I am just wondering if we could use a simpler way to resolve this problem, like the following patch: let the destination CPU of the task migration do the active load balance instead of the source CPU. That would give the source CPU time while the destination is waking, and we wouldn't need to keep the destination alive. What's your opinion on this?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7984458..f30e598 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6275,7 +6275,7 @@ more_balance: raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) { - stop_one_cpu_nowait(cpu_of(busiest), + stop_one_cpu_nowait(busiest->push_cpu, active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } @@ -7198,7 +7198,7 @@ static void hmp_force_up_migration(int this_cpu) raw_spin_unlock_irqrestore(&target->lock, flags);
if (force) - stop_one_cpu_nowait(cpu_of(target), + stop_one_cpu_nowait(target->push_cpu, hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); } @@ -7295,7 +7295,7 @@ static unsigned int hmp_idle_pull(int this_cpu) if (force) { /* start timer to keep us awake */ hmp_cpu_keepalive_trigger(); - stop_one_cpu_nowait(cpu_of(target), + stop_one_cpu_nowait(target->push_cpu, hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); }
So do you have data showing the trade-off is worthwhile? For example, the reschedule interrupt cost and CPU-alive cost versus the cost of going to idle and being woken up, or benchmark data showing we get a performance/power benefit.
I have traces which show the resulting improvement but it's so small that it is lost in the noise in all the benchmarks we have. Most of the benchmarks do not actually involve that much migration between clusters - typically the 'benchmark' app tasks start heavy processing and continue until complete, and with the HMP thresholds we use our lighter workloads generally migrate once or twice per operation.
Are your statistics too sensitive to share on the linaro-kernel ML? If not, I will be very glad to see your data. :)
We have loads of data to show that the change has no detrimental impact on any of the metrics for our benchmarked scenarios (power is largely unchanged as is performance) however I need to complete the microbenchmark I've been working on to get some numbers showing what is visible in traces.
A micro-benchmark would be nice to persuade the community to accept it. :)
Hi Chris,
Any comments are appreciated! :)
On 04/04/2014 10:49 PM, Alex Shi wrote:
Hi Chris, I really appreciate your detailed explanations! It looks like your patch makes an excellent improvement on this issue.
I am just wondering if we could use a simpler way to resolve this problem, like the following patch: let the destination CPU of the task migration do the active load balance instead of the source CPU. That would give the source CPU time while the destination is waking, and we wouldn't need to keep the destination alive. What's your opinion on this?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7984458..f30e598 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6275,7 +6275,7 @@ more_balance: raw_spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) { - stop_one_cpu_nowait(cpu_of(busiest), + stop_one_cpu_nowait(busiest->push_cpu, active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); }
@@ -7198,7 +7198,7 @@ static void hmp_force_up_migration(int this_cpu) raw_spin_unlock_irqrestore(&target->lock, flags); if (force) - stop_one_cpu_nowait(cpu_of(target), + stop_one_cpu_nowait(target->push_cpu, hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); }
@@ -7295,7 +7295,7 @@ static unsigned int hmp_idle_pull(int this_cpu) if (force) { /* start timer to keep us awake */ hmp_cpu_keepalive_trigger(); - stop_one_cpu_nowait(cpu_of(target), + stop_one_cpu_nowait(target->push_cpu, hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); }
So do you have data showing the trade-off is worthwhile? For example, the reschedule interrupt cost and CPU-alive cost versus the cost of going to idle and being woken up, or benchmark data showing we get a performance/power benefit.
I have traces which show the resulting improvement but it's so small that it is lost in the noise in all the benchmarks we have. Most of the benchmarks do not actually involve that much migration between clusters - typically the 'benchmark' app tasks start heavy processing and continue until complete, and with the HMP thresholds we use our lighter workloads generally migrate once or twice per operation.
Are your statistics too sensitive to share on the linaro-kernel ML? If not, I will be very glad to see your data. :)
We've found a couple of issues with this patch...
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
[...]
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
[...]
+/*
+ * Work out if any of the idle states have an exit latency too high for us.
+ * ns_delay is passed in containing the max we are willing to tolerate.
+ * If there are none, set ns_delay to zero.
+ * If there are any, set ns_delay to
+ * ('target_residency of state with shortest too-big latency' - 1) * 1000.
+ */
+static void hmp_keepalive_delay(unsigned int *ns_delay)
+{
+	struct cpuidle_driver *drv;
+	drv = cpuidle_driver_ref();
In Linux 3.10 cpuidle_driver_ref() causes a NULL pointer dereference if there is no cpuidle driver, as on ARMv8 models (see https://bugs.launchpad.net/linaro-stable-kernel/+bug/1304028)
That can be fixed by back-porting from mainline Linux commit 3b9c10e98021 (cpuidle: Check the result of cpuidle_get_driver() against NULL)
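Assuming the fix is exactly what the commit title describes, the back-port presumably amounts to checking the result of cpuidle_get_driver() before taking the reference, along these lines (a sketch, not the verbatim mainline change):

struct cpuidle_driver *cpuidle_driver_ref(void)
{
	struct cpuidle_driver *drv;

	spin_lock(&cpuidle_driver_lock);

	drv = cpuidle_get_driver();
	if (drv)	/* result checked against NULL before dereferencing */
		drv->refcnt++;

	spin_unlock(&cpuidle_driver_lock);
	return drv;
}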
Also...
+	if (drv) {
+		unsigned int us_delay = UINT_MAX;
+		unsigned int us_max_delay = *ns_delay / 1000;
+		int idx;
+		/* if cpuidle states are guaranteed to be sorted we
+		 * could stop at the first match.
+		 */
+		for (idx = 0; idx < drv->state_count; idx++) {
+			if (drv->states[idx].exit_latency > us_max_delay &&
+				drv->states[idx].target_residency < us_delay) {
+				us_delay = drv->states[idx].target_residency;
+			}
+		}
+		if (us_delay == UINT_MAX)
+			*ns_delay = 0; /* no timer required */
+		else
+			*ns_delay = 1000 * (us_delay - 1);
+	}
+	cpuidle_driver_unref();
+}
We shouldn't call cpuidle_driver_unref() if cpuidle_driver_ref() returned NULL, because we didn't actually get a reference.
I propose fixing that by moving the cpuidle_driver_unref() up one line to be inside the if() clause.
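With that change applied, the function from the patch would read roughly as follows (a sketch assembled from the code quoted above):

static void hmp_keepalive_delay(unsigned int *ns_delay)
{
	struct cpuidle_driver *drv;
	drv = cpuidle_driver_ref();
	if (drv) {
		unsigned int us_delay = UINT_MAX;
		unsigned int us_max_delay = *ns_delay / 1000;
		int idx;
		/* if cpuidle states are guaranteed to be sorted we
		 * could stop at the first match.
		 */
		for (idx = 0; idx < drv->state_count; idx++) {
			if (drv->states[idx].exit_latency > us_max_delay &&
			    drv->states[idx].target_residency < us_delay) {
				us_delay = drv->states[idx].target_residency;
			}
		}
		if (us_delay == UINT_MAX)
			*ns_delay = 0; /* no timer required */
		else
			*ns_delay = 1000 * (us_delay - 1);
		/* only drop the reference we actually took */
		cpuidle_driver_unref();
	}
}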
When looking for a task to be idle-pulled, don't consider tasks where the affinity does not allow that task to be placed on the target CPU. Also ensure that tasks with restricted affinity do not block selecting other unrestricted busy tasks.
Use the knowledge of the target CPU more effectively in idle pull by passing it to hmp_get_heaviest_task when we know it, otherwise only checking for general affinity matches with any of the CPUs in the bigger HMP domain.
We still need to explicitly check affinity is allowed in idle pull since if we find no match in hmp_get_heaviest_task we will return the current one, which may not be affine to the new CPU despite having high enough load. In this case, there is nothing to move.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1929e32..681a24e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3721,30 +3721,31 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu);
/* must hold runqueue lock for queue se is currently on */ static struct sched_entity *hmp_get_heaviest_task( - struct sched_entity *se, int migrate_up) + struct sched_entity *se, int target_cpu) { int num_tasks = hmp_max_tasks; struct sched_entity *max_se = se; unsigned long int max_ratio = se->avg.load_avg_ratio; const struct cpumask *hmp_target_mask = NULL; + struct hmp_domain *hmp;
- if (migrate_up) { - struct hmp_domain *hmp; - if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) - return max_se; + if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) + return max_se;
- hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); - hmp_target_mask = &hmp->cpus; + hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus; + if (target_cpu >= 0) { + BUG_ON(!cpumask_test_cpu(target_cpu, hmp_target_mask)); + hmp_target_mask = cpumask_of(target_cpu); } /* The currently running task is not on the runqueue */ se = __pick_first_entity(cfs_rq_of(se));
while (num_tasks && se) { if (entity_is_task(se) && - (se->avg.load_avg_ratio > max_ratio && - hmp_target_mask && - cpumask_intersects(hmp_target_mask, - tsk_cpus_allowed(task_of(se))))) { + se->avg.load_avg_ratio > max_ratio && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se)))) { max_se = se; max_ratio = se->avg.load_avg_ratio; } @@ -7153,7 +7154,7 @@ static void hmp_force_up_migration(int this_cpu) } } orig = curr; - curr = hmp_get_heaviest_task(curr, 1); + curr = hmp_get_heaviest_task(curr, -1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { cpu_rq(target_cpu)->wake_for_idle_pull = 1; @@ -7250,12 +7251,14 @@ static unsigned int hmp_idle_pull(int this_cpu) } } orig = curr; - curr = hmp_get_heaviest_task(curr, 1); + curr = hmp_get_heaviest_task(curr, this_cpu); /* check if heaviest eligible task on this * CPU is heavier than previous task */ if (hmp_task_eligible_for_up_migration(curr) && - curr->avg.load_avg_ratio > ratio) { + curr->avg.load_avg_ratio > ratio && + cpumask_test_cpu(this_cpu, + tsk_cpus_allowed(task_of(curr)))) { p = task_of(curr); target = rq; ratio = curr->avg.load_avg_ratio;
Hi Chris
We are hitting the BUG_ON below when CPUs are offlined or onlined...
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote: [...]
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1929e32..681a24e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3721,30 +3721,31 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu);
 /* must hold runqueue lock for queue se is currently on */
 static struct sched_entity *hmp_get_heaviest_task(
-		struct sched_entity *se, int migrate_up)
+		struct sched_entity *se, int target_cpu)
 {
 	int num_tasks = hmp_max_tasks;
 	struct sched_entity *max_se = se;
 	unsigned long int max_ratio = se->avg.load_avg_ratio;
 	const struct cpumask *hmp_target_mask = NULL;
+	struct hmp_domain *hmp;

-	if (migrate_up) {
-		struct hmp_domain *hmp;
-		if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
-			return max_se;
+	if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
+		return max_se;

-		hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
-		hmp_target_mask = &hmp->cpus;
+	hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
+	hmp_target_mask = &hmp->cpus;
+	if (target_cpu >= 0) {
+		BUG_ON(!cpumask_test_cpu(target_cpu, hmp_target_mask));
This BUG_ON goes off about half the time CPUs are onlined or offlined. (See https://bugs.launchpad.net/linaro-stable-kernel/+bug/1301886)
It looks like updates to hmp_target_mask can lag or lead idle-pull attempts. I don't know if it's worth trying to investigate if there's something that can or should be done to avoid that situation occurring, or just cope with it by adding something like...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7984458..38bc70b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3735,7 +3735,8 @@ static struct sched_entity *hmp_get_heaviest_task( hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); hmp_target_mask = &hmp->cpus; if (target_cpu >= 0) { - BUG_ON(!cpumask_test_cpu(target_cpu, hmp_target_mask)); + if (!cpumask_test_cpu(target_cpu, hmp_target_mask)) + return 0; hmp_target_mask = cpumask_of(target_cpu); } /* The currently running task is not on the runqueue */ @@ -7255,7 +7256,7 @@ static unsigned int hmp_idle_pull(int this_cpu) /* check if heaviest eligible task on this * CPU is heavier than previous task */ - if (hmp_task_eligible_for_up_migration(curr) && + if (curr && hmp_task_eligible_for_up_migration(curr) && curr->avg.load_avg_ratio > ratio && cpumask_test_cpu(this_cpu, tsk_cpus_allowed(task_of(curr)))) {
On Thu, 2014-04-03 at 14:32 +0100, Jon Medhurst (Tixy) wrote: [...]
It looks like updates to hmp_target_mask can lag or lead idle-pull attempts. I don't know if it's worth trying to investigate if there's something that can or should be done to avoid that situation occurring, or just cope with it by adding something like...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7984458..38bc70b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3735,7 +3735,8 @@ static struct sched_entity *hmp_get_heaviest_task(
 	hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
 	hmp_target_mask = &hmp->cpus;
 	if (target_cpu >= 0) {
-		BUG_ON(!cpumask_test_cpu(target_cpu, hmp_target_mask));
+		if (!cpumask_test_cpu(target_cpu, hmp_target_mask))
+			return 0;
 		hmp_target_mask = cpumask_of(target_cpu);
 	}
 	/* The currently running task is not on the runqueue */
@@ -7255,7 +7256,7 @@ static unsigned int hmp_idle_pull(int this_cpu)
 	/* check if heaviest eligible task on this
 	 * CPU is heavier than previous task
 	 */
-	if (hmp_task_eligible_for_up_migration(curr) &&
+	if (curr && hmp_task_eligible_for_up_migration(curr) &&
 		curr->avg.load_avg_ratio > ratio &&
 		cpumask_test_cpu(this_cpu,
 			tsk_cpus_allowed(task_of(curr)))) {
Assuming I don't hear otherwise soon, I plan on going with the above change. It seems to me that BUG_ON is excessive anyway, for something we can safely and simply deal with.
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
Are you and Basil OK if these patches go into LSK as they are?
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
On 24/03/14 15:33, Jon Medhurst (Tixy) wrote:
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
OK, no problem. I won't bother changing it locally as my topic branch will remain based on the old tree and I'll get them clean with your hashes when I catch up.
Are you and Basil OK if these patches go into LSK as they are?
Yes, I think so. In the mean time I'll send a kernel image to Basil and ask him to get our automated testing running overnight tonight.
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
Your reviews have been great, thanks again.
--Chris
On 24/03/14 15:33, Jon Medhurst (Tixy) wrote:
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
Are you and Basil OK if these patches go into LSK as they are?
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
Hang on Tixy, I didn't notice earlier but that only boots one CPU on my board now..
Will investigate further.
--Chris
On 24/03/14 16:56, Chris Redpath wrote:
On 24/03/14 15:33, Jon Medhurst (Tixy) wrote:
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
Are you and Basil OK if these patches go into LSK as they are?
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
Hang on Tixy, I didn't notice earlier but that only boots one CPU on my board now..
Will investigate further.
--Chris
*phew* It was a DTB change issue - I hadn't updated the DTB when I needed to. I think we need to regenerate our test DTBs as well; they're not part of the LSK, as we have A7-only, A15-only, and MP in a7bc or a15bc modes :/
--Chris
On Mon, 2014-03-24 at 16:56 +0000, Chris Redpath wrote:
On 24/03/14 15:33, Jon Medhurst (Tixy) wrote:
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
Are you and Basil OK if these patches go into LSK as they are?
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
Hang on Tixy, I didn't notice earlier but that only boots one CPU on my board now..
Will investigate further.
Sure, no rush, let's make sure it's right. Though it worked OK for me when I tried it, and I just double-checked it again. (I pushed out my test branch to: https://git.linaro.org/people/tixy/kernel.git/shortlog/refs/heads/test-mp-ne...)
And the new patches, apart from the whitespace and a s/inline/static/ were the same as the ones I was testing at the end of last week, on which I ran the functional scheduler tests OK as well as basic manual testing of Android, which included watching the pretty LEDs on TC2 and motherboard to make sure CPU and cluster activity looks sane.
Hi Tixy,
Just want to confirm whether the patches from Chris are in the 14.03 RC or not.
The reason I am asking is to ensure we send an updated release note if this is indeed included in 14.03 LSK. Please confirm.
Thanks Basil Eljuse...
-----Original Message----- From: Jon Medhurst (Tixy) [mailto:tixy@linaro.org] Sent: 24 March 2014 17:21 To: Chris Redpath Cc: mark.brown@linaro.org; Robin Randhawa; Basil Eljuse; linaro-kernel@lists.linaro.org; alex.shi@linaro.org Subject: Re: Idle Pull etc. for LSK v5
On Mon, 2014-03-24 at 16:56 +0000, Chris Redpath wrote:
On 24/03/14 15:33, Jon Medhurst (Tixy) wrote:
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
Are you and Basil OK if these patches go into LSK as they are?
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
Hang on Tixy, I didn't notice earlier but that only boots one CPU on my board now..
Will investigate further.
Sure, no rush, let's make sure it's right. Though it worked OK for me when I tried it, and I just double-checked it again. (I pushed out my test branch to: https://git.linaro.org/people/tixy/kernel.git/shortlog/refs/heads/test-mp-ne...)
And the new patches, apart from the whitespace and a s/inline/static/ were the same as the ones I was testing at the end of last week, on which I ran the functional scheduler tests OK as well as basic manual testing of Android, which included watching the pretty LEDs on TC2 and motherboard to make sure CPU and cluster activity looks sane.
-- Tixy
On 03/24/2014 11:33 PM, Jon Medhurst (Tixy) wrote:
On Mon, 2014-03-24 at 13:47 +0000, Chris Redpath wrote:
Hi Tixy, this is the current version with the comments I remember so far.
What's new: Added your build fail fix in patch 2; removed the implicit dependency on CPU_IDLE from patch 3.
Thanks Chris, looks good to me, and I have no more comments on this patch series. Well, patch three has a tab in hmp_keepalive_delay() where there should be a space; I'll fix that myself before I apply the patches to the big.LITTLE-MP topic branch.
Are you and Basil OK if these patches go into LSK as they are?
I don't know if Alex has any questions about these patches, he's been a bit quiet...?
This patchset looks fine to me.
When a normal forced up-migration takes place we stop the task to be migrated while the target CPU becomes available. This delay can range from 80us to 1500us on TC2 if the target CPU is in a deep idle state.
Instead, interrupt the target CPU and ask it to pull a task. This lets the current eligible task continue executing on the original CPU while the target CPU wakes. Use a pinned timer to prevent the pulling CPU going back into power-down with pending up-migrations.
If we trigger for a nohz kick, it does not matter whether we also trigger for an idle pull, since the idle_pull flag will already be set when we execute the softirq and we'll still do the idle pull.
If the target CPU is busy, we will not pull any tasks.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/core.c | 11 +++- kernel/sched/fair.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 1 + 3 files changed, 142 insertions(+), 12 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index de9d360..f583951 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1407,7 +1407,11 @@ void scheduler_ipi(void) { if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + && !got_nohz_idle_kick() +#ifdef CONFIG_SCHED_HMP + && !this_rq()->wake_for_idle_pull +#endif + ) return;
/* @@ -1434,6 +1438,11 @@ void scheduler_ipi(void) this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } +#ifdef CONFIG_SCHED_HMP + else if (unlikely(this_rq()->wake_for_idle_pull)) + raise_softirq_irqoff(SCHED_SOFTIRQ); +#endif + irq_exit(); }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e3686b..9b62b34 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -39,6 +39,9 @@ */ #include <linux/cpufreq.h> #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#ifdef CONFIG_SCHED_HMP +#include <linux/cpuidle.h> +#endif
#include "sched.h"
@@ -3541,6 +3544,110 @@ static const int hmp_max_tasks = 5;
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+#ifdef CONFIG_CPU_IDLE +/* + * hmp_idle_pull: + * + * In this version we have stopped using forced up migrations when we + * detect that a task running on a little CPU should be moved to a bigger + * CPU. In most cases, the bigger CPU is in a deep sleep state and a forced + * migration means we stop the task immediately but need to wait for the + * target CPU to wake up before we can restart the task which is being + * moved. Instead, we now wake a big CPU with an IPI and ask it to pull + * a task when ready. This allows the task to continue executing on its + * current CPU, reducing the amount of time that the task is stalled for. + * + * keepalive timers: + * + * The keepalive timer is used as a way to keep a CPU engaged in an + * idle pull operation out of idle while waiting for the source + * CPU to stop and move the task. Ideally this would not be necessary + * and we could impose a temporary zero-latency requirement on the + * current CPU, but in the current QoS framework this will result in + * all CPUs in the system being unable to enter idle states which is + * not desirable. The timer does not perform any work when it expires. + */ +struct hmp_keepalive { + bool init; + ktime_t delay; /* if zero, no need for timer */ + struct hrtimer timer; +}; +DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); + +/* setup per-cpu keepalive timers */ +static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer) +{ + return HRTIMER_NORESTART; +} + +/* + * Work out if any of the idle states have an exit latency too high for us. + * ns_delay is passed in containing the max we are willing to tolerate. + * If there are none, set ns_delay to zero. + * If there are any, set ns_delay to + * ('target_residency of state with shortest too-big latency' - 1) * 1000. + */ +static void hmp_keepalive_delay(unsigned int *ns_delay) +{ + struct cpuidle_driver *drv; + drv = cpuidle_driver_ref(); + if (drv) { + unsigned int us_delay = UINT_MAX; + unsigned int us_max_delay = *ns_delay / 1000; + int idx; + /* if cpuidle states are guaranteed to be sorted we + * could stop at the first match. + */ + for (idx = 0; idx < drv->state_count; idx++) { + if (drv->states[idx].exit_latency > us_max_delay && + drv->states[idx].target_residency < us_delay) { + us_delay = drv->states[idx].target_residency; + } + } + if (us_delay == UINT_MAX) + *ns_delay = 0; /* no timer required */ + else + *ns_delay = 1000 * (us_delay - 1); + } + cpuidle_driver_unref(); +} + +static void hmp_cpu_keepalive_trigger(void) +{ + int cpu = smp_processor_id(); + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (!keepalive->init) { + unsigned int ns_delay = 100000; /* tolerate 100usec delay */ + + hrtimer_init(&keepalive->timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + keepalive->timer.function = hmp_cpu_keepalive_notify; + + hmp_keepalive_delay(&ns_delay); + keepalive->delay = ns_to_ktime(ns_delay); + keepalive->init = true; + } + if (ktime_to_ns(keepalive->delay)) + hrtimer_start(&keepalive->timer, + keepalive->delay, HRTIMER_MODE_REL_PINNED); +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ + struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu); + if (keepalive->init) + hrtimer_cancel(&keepalive->timer); +} +#else /* !CONFIG_CPU_IDLE */ +static void hmp_cpu_keepalive_trigger(void) +{ +} + +static void hmp_cpu_keepalive_cancel(int cpu) +{ +} +#endif + /* Setup hmp_domains */ static int __init hmp_cpu_mask_setup(void) { @@ -3601,6 +3708,8 @@ static void hmp_offline_cpu(int cpu)
if(domain) cpumask_clear_cpu(cpu, &domain->cpus); + + hmp_cpu_keepalive_cancel(cpu); } /* * Needed to determine heaviest tasks etc. @@ -7022,7 +7131,7 @@ static void hmp_force_up_migration(int this_cpu) target = cpu_rq(cpu); raw_spin_lock_irqsave(&target->lock, flags); curr = target->cfs.curr; - if (!curr) { + if (!curr || target->active_balance) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } @@ -7039,16 +7148,13 @@ static void hmp_force_up_migration(int this_cpu) curr = hmp_get_heaviest_task(curr, 1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { - if (!target->active_balance) { - get_task_struct(p); - target->push_cpu = target_cpu; - target->migrate_task = p; - got_target = 1; - trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE); - hmp_next_up_delay(&p->se, target->push_cpu); - } + cpu_rq(target_cpu)->wake_for_idle_pull = 1; + raw_spin_unlock_irqrestore(&target->lock, flags); + spin_unlock(&hmp_force_migration); + smp_send_reschedule(target_cpu); + return; } - if (!got_target && !target->active_balance) { + if (!got_target) { /* * For now we just check the currently running task. * Selecting the lightest task for offloading will @@ -7070,7 +7176,7 @@ static void hmp_force_up_migration(int this_cpu) * is not currently running move it, otherwise let the * CPU stopper take care of it. */ - if (got_target && !target->active_balance) { + if (got_target) { if (!task_running(target, p)) { trace_sched_hmp_migrate_force_running(p, 0); hmp_migrate_runnable_task(target); @@ -7176,6 +7282,8 @@ static unsigned int hmp_idle_pull(int this_cpu) raw_spin_unlock_irqrestore(&target->lock, flags);
if (force) { + /* start timer to keep us awake */ + hmp_cpu_keepalive_trigger(); stop_one_cpu_nowait(cpu_of(target), hmp_active_task_migration_cpu_stop, target, &target->active_balance_work); @@ -7199,6 +7307,18 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE;
+#ifdef CONFIG_SCHED_HMP + /* shortcut for hmp idle pull wakeups */ + if (unlikely(this_rq->wake_for_idle_pull)) { + this_rq->wake_for_idle_pull = 0; + if (hmp_idle_pull(this_cpu)) { + /* break out unless running nohz idle as well */ + if (idle != CPU_IDLE) + return; + } + } +#endif + hmp_force_up_migration(this_cpu);
rebalance_domains(this_cpu, idle); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 989c5ae..0d19ede 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -466,6 +466,7 @@ struct rq { struct cpu_stop_work active_balance_work; #ifdef CONFIG_SCHED_HMP struct task_struct *migrate_task; + int wake_for_idle_pull; #endif /* cpu of this runqueue: */ int cpu;
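To make the delay calculation in hmp_keepalive_delay() above easier to follow, here is a minimal userspace sketch of the same rule using an invented idle-state table; the struct and the latency/residency numbers are illustrative stand-ins, not real TC2 cpuidle values.

```c
#include <limits.h>
#include <stdio.h>

/* Hypothetical idle-state table (exit_latency / target_residency in us);
 * in the kernel these come from the platform's cpuidle driver. */
struct demo_state { unsigned int exit_latency, target_residency; };

static void demo_keepalive_delay(const struct demo_state *states, int count,
				 unsigned int *ns_delay)
{
	unsigned int us_delay = UINT_MAX;
	unsigned int us_max_delay = *ns_delay / 1000;	/* tolerated exit latency */

	for (int idx = 0; idx < count; idx++) {
		/* among states that wake too slowly, remember the shortest
		 * target residency */
		if (states[idx].exit_latency > us_max_delay &&
		    states[idx].target_residency < us_delay)
			us_delay = states[idx].target_residency;
	}
	if (us_delay == UINT_MAX)
		*ns_delay = 0;			/* no timer needed */
	else
		*ns_delay = 1000 * (us_delay - 1);
}

int main(void)
{
	/* Invented numbers, roughly in the shape of a big.LITTLE SoC:
	 * WFI (1us/1us), cluster retention (150us/350us),
	 * cluster off (1200us/2500us). */
	const struct demo_state states[] = {
		{ 1, 1 }, { 150, 350 }, { 1200, 2500 },
	};
	unsigned int ns_delay = 100000;	/* tolerate 100us, as in the patch */

	demo_keepalive_delay(states, 3, &ns_delay);
	printf("keepalive delay: %u ns\n", ns_delay);	/* prints 349000 */
	return 0;
}
```

The idea, as described in the patch comment, is that arming a timer to fire just before the shortest target residency of any "too slow" state keeps the cpuidle governor from selecting those states while an idle pull is pending, without imposing a system-wide QoS constraint.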
When looking for a task to be idle-pulled, don't consider tasks where the affinity does not allow that task to be placed on the target CPU. Also ensure that tasks with restricted affinity do not block selecting other unrestricted busy tasks.
Use the knowledge of the target CPU more effectively in idle pull by passing it to hmp_get_heaviest_task when we know it; otherwise, only check for general affinity matches with any of the CPUs in the bigger HMP domain.
We still need to explicitly check that affinity allows the move in idle pull, since if we find no match in hmp_get_heaviest_task we will return the current task, which may not be affine to the new CPU despite having a high enough load. In that case, there is nothing to move.
Signed-off-by: Chris Redpath chris.redpath@arm.com --- kernel/sched/fair.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b62b34..24b5b31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3721,30 +3721,31 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu);
/* must hold runqueue lock for queue se is currently on */ static struct sched_entity *hmp_get_heaviest_task( - struct sched_entity *se, int migrate_up) + struct sched_entity *se, int target_cpu) { int num_tasks = hmp_max_tasks; struct sched_entity *max_se = se; unsigned long int max_ratio = se->avg.load_avg_ratio; const struct cpumask *hmp_target_mask = NULL; + struct hmp_domain *hmp;
- if (migrate_up) { - struct hmp_domain *hmp; - if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) - return max_se; + if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) + return max_se;
- hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); - hmp_target_mask = &hmp->cpus; + hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); + hmp_target_mask = &hmp->cpus; + if (target_cpu >= 0) { + BUG_ON(!cpumask_test_cpu(target_cpu, hmp_target_mask)); + hmp_target_mask = cpumask_of(target_cpu); } /* The currently running task is not on the runqueue */ se = __pick_first_entity(cfs_rq_of(se));
while (num_tasks && se) { if (entity_is_task(se) && - (se->avg.load_avg_ratio > max_ratio && - hmp_target_mask && - cpumask_intersects(hmp_target_mask, - tsk_cpus_allowed(task_of(se))))) { + se->avg.load_avg_ratio > max_ratio && + cpumask_intersects(hmp_target_mask, + tsk_cpus_allowed(task_of(se)))) { max_se = se; max_ratio = se->avg.load_avg_ratio; } @@ -7145,7 +7146,7 @@ static void hmp_force_up_migration(int this_cpu) } } orig = curr; - curr = hmp_get_heaviest_task(curr, 1); + curr = hmp_get_heaviest_task(curr, -1); p = task_of(curr); if (hmp_up_migration(cpu, &target_cpu, curr)) { cpu_rq(target_cpu)->wake_for_idle_pull = 1; @@ -7242,12 +7243,14 @@ static unsigned int hmp_idle_pull(int this_cpu) } } orig = curr; - curr = hmp_get_heaviest_task(curr, 1); + curr = hmp_get_heaviest_task(curr, this_cpu); /* check if heaviest eligible task on this * CPU is heavier than previous task */ if (hmp_task_eligible_for_up_migration(curr) && - curr->avg.load_avg_ratio > ratio) { + curr->avg.load_avg_ratio > ratio && + cpumask_test_cpu(this_cpu, + tsk_cpus_allowed(task_of(curr)))) { p = task_of(curr); target = rq; ratio = curr->avg.load_avg_ratio;