On 01/05/15 00:49, Michael Turquette wrote:
Quoting Daniel Lezcano (2015-04-30 08:05:52)
On 04/27/2015 09:46 AM, Michael Turquette wrote:
[snip]
+	/*
+	 * get_cpu_usage is called without locking the runqueues. This is the
+	 * same behavior used by find_busiest_cpu in load_balance. We are
+	 * willing to accept occasionally stale data here in exchange for
+	 * lockless behavior.
+	 */
+	for_each_cpu(cpu, policy->cpus) {
+		usage = get_cpu_usage(cpu);
+		if (usage > max_usage)
+			max_usage = usage;
+	}
+
+	/* add margin to max_usage based on imbalance_pct */
+	max_usage = max_usage * MARGIN_PCT / 100;
+
+	cpu = cpumask_first(policy->cpus);
+
+	/* freq is current utilization + 25% */
+	freq = max_usage * policy->max / capacity_orig_of(cpu);

Couldn't this be slightly simplified by using
cpu_rq(cpu)->cfs.utilization_load_avg directly instead of calling
get_cpu_usage?
The big.LITTLE case here is confusing. Is cfs.utilization_load_avg already
normalized against cpu capacity differences? If so then you are right, I
could use the value directly. But if not, then get_cpu_usage buys us that
normalization by doing:

	cfs.utilization_load_avg * capacity_orig >> SCHED_LOAD_SHIFT;

where capacity_orig may be different across the various CPUs.

Nope. In this patchset utilization_load_avg is only freq-invariant.

Thanks,
- Juri
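A minimal sketch of the normalization in question, built from the helpers
quoted above (the wrapper function itself is illustrative, not part of the
series):

	static unsigned long usage_normalized(int cpu)
	{
		/* raw PELT utilization: freq-invariant, not capacity-invariant */
		unsigned long raw = cpu_rq(cpu)->cfs.utilization_load_avg;

		/* scale by original capacity so big and LITTLE cpus compare */
		return (raw * capacity_orig_of(cpu)) >> SCHED_LOAD_SHIFT;
	}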
+out:
+	return freq;
+}
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int gov_cfs_thread(void *data)
+{
+	struct sched_param param;
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+	unsigned long freq;
+	int ret;
+
+	policy = (struct cpufreq_policy *) data;
+	if (!policy) {
+		pr_warn("%s: missing policy\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	gd = policy->governor_data;
+	if (!gd) {
+		pr_warn("%s: missing governor data\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	param.sched_priority = 50;
+	ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		do_exit(-EINVAL);
+	} else {
+		pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+				__func__, gd->task->pid);
+	}
+
+	ret = set_cpus_allowed_ptr(gd->task, policy->related_cpus);
+	if (ret) {
+		pr_warn("%s: failed to set allowed ptr\n", __func__);
+		do_exit(-EINVAL);
+	}
+
+	/* main loop of the per-policy kthread */
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+
+		if (kthread_should_stop())
+			break;
+
+		/* avoid race with gov_cfs_stop */
+		if (!down_write_trylock(&policy->rwsem))
+			continue;
+
+		freq = gov_cfs_select_freq(policy);
+
+		ret = __cpufreq_driver_target(policy, freq,
+				CPUFREQ_RELATION_H);
+		if (ret)
+			pr_debug("%s: __cpufreq_driver_target returned %d\n",
+					__func__, ret);

Shouldn't the relation be H or L depending on whether we are increasing or
decreasing the freq?
Yes, this was pointed out in another reply. I'll fix it up.
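A possible shape for that fix-up (illustrative, not from the posted series;
CPUFREQ_RELATION_L selects the lowest driver frequency at or above the
target, CPUFREQ_RELATION_H the highest at or below it):

	/*
	 * Illustrative only: round up (RELATION_L) when raising the
	 * frequency so we don't land below the requested capacity, and
	 * round down (RELATION_H) when lowering it so we don't overshoot.
	 */
	unsigned int relation = (freq > policy->cur) ? CPUFREQ_RELATION_L
						     : CPUFREQ_RELATION_H;

	ret = __cpufreq_driver_target(policy, freq, relation);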
+		gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+
+		up_write(&policy->rwsem);
+	} while (!kthread_should_stop());
+
+	do_exit(0);
+}
+static void gov_cfs_irq_work(struct irq_work *irq_work)
+{
+	struct gov_data *gd;
+
+	gd = container_of(irq_work, struct gov_data, irq_work);
+	if (!gd) {
+		return;
+	}
+
+	wake_up_process(gd->task);
+}
+/**
+ * gov_cfs_update_cpu - interface to scheduler for changing capacity values
+ * @cpu: cpu whose capacity utilization has recently changed
+ *
+ * gov_cfs_update_cpu is an interface exposed to the scheduler so that the
+ * scheduler may inform the governor of updates to capacity utilization and
+ * make changes to cpu frequency. Currently this interface is designed around
+ * PELT values in CFS. It can be expanded to other scheduling classes in the
+ * future if needed.
+ *
+ * gov_cfs_update_cpu raises an IPI. The irq_work handler for that IPI wakes up
+ * the thread that does the actual work, gov_cfs_thread.
+ */
+void gov_cfs_update_cpu(int cpu)
+{
+	struct cpufreq_policy *policy;
+	struct gov_data *gd;
+
+	/* XXX put policy pointer in per-cpu data? */
+	policy = cpufreq_cpu_get(cpu);
+	if (IS_ERR_OR_NULL(policy)) {
+		return;
+	}
+
+	if (!policy->governor_data) {
+		goto out;
+	}
+
+	gd = policy->governor_data;
+
+	/* bail early if we are throttled */
+	if (ktime_before(ktime_get(), gd->throttle)) {
+		goto out;
+	}
+
+	irq_work_queue_on(&gd->irq_work, cpu);
+
+out:
+	cpufreq_cpu_put(policy);
+	return;
+}
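To make the throttling concrete (illustrative numbers, not from the
series): if the driver reports transition_latency = 500000 ns, each
completed frequency change arms a 500 us window during which scheduler
updates are simply dropped:

	/* in gov_cfs_thread(), after a frequency transition */
	gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);

	/* in gov_cfs_update_cpu(), for the next 500 us */
	if (ktime_before(ktime_get(), gd->throttle))
		goto out;	/* throttled: no irq_work is queued */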
+static void gov_cfs_start(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	/* prepare per-policy private data */
+	gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+	if (!gd) {
+		pr_debug("%s: failed to allocate private data\n", __func__);
+		return;
+	}
+
+	/*
+	 * Don't ask for freq changes at a higher rate than what
+	 * the driver advertises as transition latency.
+	 */
+	gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+			    policy->cpuinfo.transition_latency :
+			    THROTTLE_NSEC;
+	pr_debug("%s: throttle threshold = %u [ns]\n",
+		 __func__, gd->throttle_nsec);
+
+	/* init per-policy kthread */
+	gd->task = kthread_run(gov_cfs_thread, policy, "kgov_cfs_task");
+	if (IS_ERR_OR_NULL(gd->task))
+		pr_err("%s: failed to create kgov_cfs_task thread\n", __func__);
+
+	init_irq_work(&gd->irq_work, gov_cfs_irq_work);

It does not make sense to have a workqueue and a kthread; this duplicates
what the workqueue already does.
There is no traditional wq here. Just irq_work handler + kthread.
I also saw the irq_work + kthread mail you sent, and I believe it is the
way to go. Did you think about creating a workqueue per clock line instead
of using irq_work?
I am confused by how you phrased the above text. This patch uses irq_work
+ kthread, which you say is the way to go. But I don't use a traditional
workqueue instead of irq_work.

I did implement a method that uses irq_work + workqueue, and it had some
bugs, and it also clearly inflated the CFS load-tracking stats (the
kworker runs as a CFS task, so it feeds back into the very signal being
tracked).
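For reference, the update path in this series is roughly (condensed from
the code above):

	scheduler hook (enqueue/dequeue/task_tick, rq->lock held)
	    -> gov_cfs_update_cpu()
	        -> irq_work_queue_on()          /* raises an IPI */
	            -> gov_cfs_irq_work()       /* hard-irq context */
	                -> wake_up_process(gd->task)
	                    -> gov_cfs_thread() /* process context, may
	                                           sleep, calls the driver */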
Thanks a lot for the review,
Mike
+	policy->governor_data = gd;
+	gd->policy = policy;
+}
+static void gov_cfs_stop(struct cpufreq_policy *policy)
+{
+	struct gov_data *gd;
+
+	gd = policy->governor_data;
+
+	kthread_stop(gd->task);
+
+	policy->governor_data = NULL;
+
+	/* FIXME replace with devm counterparts? */
+	kfree(gd);
+}
+static int gov_cfs_setup(struct cpufreq_policy *policy, unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_START:
+		/* Start managing the frequency */
+		gov_cfs_start(policy);
+		return 0;
+
+	case CPUFREQ_GOV_STOP:
+		gov_cfs_stop(policy);
+		return 0;
+
+	case CPUFREQ_GOV_LIMITS:	/* unused */
+	case CPUFREQ_GOV_POLICY_INIT:	/* unused */
+	case CPUFREQ_GOV_POLICY_EXIT:	/* unused */
+		break;
+	}
+
+	return 0;
+}
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED_CFS
+static
+#endif
+struct cpufreq_governor cpufreq_gov_cfs = {
+	.name		= "gov_cfs",
+	.governor	= gov_cfs_setup,
+	.owner		= THIS_MODULE,
+};
+static int __init gov_cfs_init(void)
+{
+	return cpufreq_register_governor(&cpufreq_gov_cfs);
+}
+
+static void __exit gov_cfs_exit(void)
+{
+	cpufreq_unregister_governor(&cpufreq_gov_cfs);
+}
+/* Try to make this the default governor */
+fs_initcall(gov_cfs_init);
+MODULE_LICENSE("GPL");

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 393fc36..a7b97f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4257,6 +4257,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_rq_runnable_avg(rq, rq->nr_running);
 		add_nr_running(rq, 1);
 	}
+
+	if (sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
+
 	hrtick_update(rq);
@@ -4318,6 +4322,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
 	}
+
+	if (sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
+
 	hrtick_update(rq);
@@ -7821,6 +7829,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		task_tick_numa(rq, curr);
 
 	update_rq_runnable_avg(rq, 1);
+
+	if (sched_energy_freq())
+		gov_cfs_update_cpu(cpu_of(rq));
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 63a8be9..ec23523 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1399,6 +1399,12 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 int get_cpu_usage(int cpu);
 unsigned long capacity_orig_of(int cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED_CFS
+void gov_cfs_update_cpu(int cpu);
+#else
+static inline void gov_cfs_update_cpu(int cpu) {}
+#endif
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
 	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));