From: Srivatsa Vaddagiri <vatsa@codeaurora.org>
This patch implements an alternative, window-based CPU utilization tracking mechanism in the scheduler. Per-task and per-CPU counters are updated with utilization statistics using a synchronized (across CPUs) time source, and a single statistic (prev_runnable_sum) is fed to the registered utilization callback listeners. Utilization is accounted over discrete windows of time, with the window size determined by walt_ravg_window.
There are two per-CPU rq quantities maintained by WALT, both normalized to the max possible frequency and the max efficiency (IPC) of that CPU (the normalization is sketched after the list below):
curr_runnable_sum: aggregate utilization of all tasks that executed during the current (not yet completed) window
prev_runnable_sum: aggregate utilization of all tasks that executed during the most recent completed window
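To make the normalization concrete, here is a rough standalone sketch of how each accounted execution delta is scaled. It mirrors scale_exec_time() in the walt.c hunk below; the helper name walt_scale_delta and its parameter list are illustrative only:

    /*
     * Illustrative sketch of WALT's busy-time normalization (see
     * scale_exec_time() in kernel/sched/walt.c below). The delta is
     * scaled by the CPU's current frequency relative to the maximum
     * possible frequency in the system, and by the CPU's efficiency
     * (IPC) relative to the maximum efficiency, so busy time is
     * comparable across CPUs and P-states.
     */
    static u64 walt_scale_delta(u64 delta, unsigned int cur_freq,
                                unsigned int max_possible_freq,
                                unsigned int efficiency,
                                unsigned int max_possible_efficiency)
    {
            /* Scale by frequency, rounding the division up. */
            delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
                              max_possible_freq);

            /* Scale by efficiency (IPC), against a base of 1024. */
            delta *= DIV_ROUND_UP(efficiency * 1024, max_possible_efficiency);
            return delta >> 10;
    }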
prev_runnable_sum is the primary statistic used to guide CPU frequency in lieu of PELT's cfs_rq->util_avg. No additional policy is imposed on this statistic; the assumption is that the consumer (e.g., schedutil) will apply its own policy (e.g., a margin) before deciding the next P-state.
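For reference, the conversion presented to the consumer is the one done by cpu_walt_util() in walt.h below (shown here with the tracepoint call dropped): the previous window's busy sum is scaled to the CPU's original capacity, with a fallback to PELT when WALT metrics are disabled.

    static inline unsigned long cpu_walt_util(struct rq *rq)
    {
            /* Fall back to PELT's signal when WALT metrics are disabled. */
            if (!sysctl_sched_use_walt_metrics)
                    return rq->cfs.avg.util_avg;

            /* Busy time in the last complete window, scaled to capacity. */
            return (rq->prev_runnable_sum * rq->cpu_capacity_orig) /
                    walt_ravg_window;
    }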
Corresponding to the aggregate statistics, WALT also maintains the following per-task statistics:
curr_window - represents the task's CPU utilization in its most recently tracked window
prev_window - represents the task's CPU utilization in the window prior to the one tracked by curr_window
WALT statistic updates are event-driven, with updates occurring at scheduler_tick, at pick_next_task and put_prev_task (i.e., in context_switch), at task wakeup, and during task migration. Migration simply involves removing a task's curr_window and prev_window from the source CPU's curr_runnable_sum and prev_runnable_sum, and adding the per-task counters to the destination CPU's aggregate counters. Execution time in an IRQ handler is accounted in a CPU's curr_runnable_sum statistic, provided that the CPU was also executing the idle task for the duration of the interrupt handler.
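The heart of the migration fixup (walt_fixup_busy_time() in walt.c below) reduces to moving the task's window contributions from one runqueue to the other. A condensed excerpt, with locking, the not-yet-started destination window case, and the negative-sum warnings omitted:

    /* Move p's contributions from the source rq to the destination rq. */
    if (p->ravg.curr_window) {
            src_rq->curr_runnable_sum -= p->ravg.curr_window;
            dest_rq->curr_runnable_sum += p->ravg.curr_window;
    }

    if (p->ravg.prev_window) {
            src_rq->prev_runnable_sum -= p->ravg.prev_window;
            dest_rq->prev_runnable_sum += p->ravg.prev_window;
    }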
Idle-task handling is modified by walt_io_is_busy: when set to 1, and a CPU's rq has tasks blocked on IO, idle-task execution is accounted in the per-task and per-CPU counters. Setting walt_io_is_busy also causes interrupt handlers that run in the idle task to update the counters as if the idle task were executing (rather than accounting only the interrupt handler's execution time).
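Condensed from cpu_is_waiting_on_io() and account_cpu_busy_time() in walt.c below, the decision of whether time spent in the idle task is accounted boils down to the following check (the helper name idle_time_counts is illustrative only; the PICK_NEXT_TASK case, which never charges idle time, is ignored):

    /* Idle-task time is accounted only if it is IRQ time, or if
     * walt_io_is_busy is set and a task on this rq is blocked on IO. */
    static int idle_time_counts(struct rq *rq, u64 irqtime)
    {
            return irqtime || (walt_io_is_busy && atomic_read(&rq->nr_iowait));
    }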
The major tunable provided by WALT is walt_ravg_window, which represents window size (in nanoseconds) and is set to 20ms by default. walt_io_is_busy (described above) is set to 0 by default.
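For example, booting with walt_ravg_window=10000000 on the kernel command line selects a 10ms window (values outside the 10ms-1s range disable WALT), and writing 0 to /proc/sys/kernel/sched_use_walt_metrics at runtime falls back to PELT's util_avg.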
Potential upcoming changes/improvements include: the use of sched_clock instead of ktime_get as a time source, support for an unsynchronized (across CPUs) time source, and integration with mainlined CPU efficiency APIs.
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Signed-off-by: Vikram Mulukutla <markivx@codeaurora.org>
---
 include/linux/sched.h            |  35 +++
 include/linux/sched/sysctl.h     |   1 +
 include/trace/events/sched.h     |  76 ++++++
 init/Kconfig                     |   9 +
 kernel/sched/Makefile            |   1 +
 kernel/sched/core.c              |  28 +-
 kernel/sched/cpufreq_schedutil.c |   7 +-
 kernel/sched/cputime.c           |  11 +-
 kernel/sched/debug.c             |  10 +
 kernel/sched/fair.c              |   7 +-
 kernel/sched/sched.h             |  10 +
 kernel/sched/walt.c              | 540 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/walt.h              |  73 ++++++
 kernel/sysctl.c                  |   9 +
 14 files changed, 812 insertions(+), 5 deletions(-)
 create mode 100644 kernel/sched/walt.c
 create mode 100644 kernel/sched/walt.h
diff --git a/include/linux/sched.h b/include/linux/sched.h index 253538f..56e708f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -314,6 +314,17 @@ extern char ___assert_task_state[1 - 2*!!( /* Task command name length */ #define TASK_COMM_LEN 16
+enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + +extern char *task_event_names[]; + #include <linux/spinlock.h>
/* @@ -1318,6 +1329,25 @@ struct sched_statistics { }; #endif
+#ifdef CONFIG_SCHED_WALT + +/* ravg represents capacity scaled cpu-usage of tasks */ +struct ravg { + /* + * 'mark_start' marks the most recent event for a task + * + * 'curr_window' represents task's cpu usage in its most recent + * window + * + * 'prev_window' represents task's cpu usage in the window prior + * to the one represented by 'curr_window' + */ + u64 mark_start; + u32 curr_window, prev_window; +}; +#endif + + struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; @@ -1478,6 +1508,11 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; + +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 22db1e6..7007815 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,6 +31,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_sched_use_walt_metrics;
#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57..2adf245 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -562,6 +562,82 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_walt_util, + + TP_PROTO(int cpu, unsigned long cfs_util_avg, unsigned long walt_util), + + TP_ARGS(cpu, cfs_util_avg, walt_util), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, cfs_util_avg ) + __field( unsigned long, walt_util ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->cfs_util_avg = cfs_util_avg; + __entry->walt_util = walt_util; + ), + + TP_printk("cpu %d cfs_util_avg %lu walt_util %lu", __entry->cpu, __entry->cfs_util_avg, __entry->walt_util) +); + +struct rq; + +TRACE_EVENT(sched_walt_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, u64 wallclock, u64 irqtime), + + TP_ARGS(p, rq, evt, wallclock, irqtime), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( pid_t, cur_pid ) + __field(unsigned int, cpu ) + __field(unsigned int, cur_freq ) + __field( u64, wallclock ) + __field( u64, mark_start ) + __field( u64, win_start ) + __field( u64, irqtime ) + __field(enum task_event, evt ) + __field( u64, rq_cs ) + __field( u64, rq_ps ) + __field( u32, curr_window ) + __field( u32, prev_window ) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = rq->window_start; + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = p->ravg.mark_start; + __entry->irqtime = irqtime; + __entry->rq_cs = rq->curr_runnable_sum; + __entry->rq_ps = rq->prev_runnable_sum; + __entry->curr_window = p->ravg.curr_window; + __entry->prev_window = p->ravg.prev_window; + ), + + TP_printk("wc %llu ws %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu irqtime %llu rq_cs %llu rq_ps %llu cur_window %u prev_window %u" + , __entry->wallclock, __entry->win_start, + task_event_names[__entry->evt], __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->irqtime, + __entry->rq_cs, __entry->rq_ps, + __entry->curr_window, __entry->prev_window + ) +); + + #endif /* _TRACE_SCHED_H */
/* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig index f755a60..e259273 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -388,6 +388,15 @@ config IRQ_TIME_ACCOUNTING
endchoice
+config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b83..41ada04 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,6 +19,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105..068bde8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,6 +90,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h>
+#include "walt.h" + DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1241,6 +1243,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + walt_fixup_busy_time(p, new_cpu); }
__set_task_cpu(p, new_cpu); @@ -2049,6 +2053,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_acquire(!p->on_cpu);
+ raw_spin_lock(&task_rq(p)->lock); + walt_update_task_ravg(p, task_rq(p), TASK_WAKE, walt_ktime_clock(), 0); + raw_spin_unlock(&task_rq(p)->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING;
@@ -2106,8 +2114,10 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
trace_sched_waking(p);
- if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + walt_update_task_ravg(p, rq, TASK_WAKE, walt_ktime_clock(), 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + }
ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) @@ -2173,6 +2183,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2540,6 +2551,8 @@ void wake_up_new_task(struct task_struct *p) rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se);
+ walt_mark_task_starting(p); + activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -3023,6 +3036,8 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock);
@@ -3271,6 +3286,7 @@ static void __sched notrace __schedule(bool preempt) struct pin_cookie cookie; struct rq *rq; int cpu; + u64 wallclock;
cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3334,6 +3350,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq);
next = pick_next_task(rq, prev, cookie); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -7229,6 +7248,9 @@ int sched_cpu_deactivate(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); + + if (!rq->window_start) + walt_set_window_start(rq);
rq->calc_load_update = calc_load_update; account_reset_rq(rq); @@ -7251,6 +7273,9 @@ int sched_cpu_dying(unsigned int cpu) /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); raw_spin_lock_irqsave(&rq->lock, flags); + + walt_migrate_sync_cpu(cpu); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -7270,6 +7295,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus;
+ walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 14c4aa2..2eef34d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -15,8 +15,10 @@ #include <linux/module.h> #include <linux/slab.h> #include <trace/events/power.h> +#include <trace/events/sched.h>
#include "sched.h" +#include "walt.h"
struct sugov_tunables { struct gov_attr_set attr_set; @@ -97,6 +99,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); + walt_freq_transition(smp_processor_id(), next_freq); } else if (sg_policy->next_freq != next_freq) { sg_policy->next_freq = next_freq; sg_policy->work_in_progress = true; @@ -125,7 +128,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, static unsigned int get_next_freq(struct cpufreq_policy *policy, unsigned long util, unsigned long max) { - unsigned int freq = arch_scale_freq_invariant() ? + int invariant = (sysctl_sched_use_walt_metrics || + arch_scale_freq_invariant()); + unsigned int freq = invariant ? policy->cpuinfo.max_freq : policy->cur;
return (freq + (freq >> 2)) * util / max; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5..af9cf3e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -52,6 +52,8 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; + u64 wallclock; + bool account = true;
if (!sched_clock_irqtime) return; @@ -59,7 +61,8 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags);
cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + wallclock = sched_clock_cpu(cpu); + delta = wallclock - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta);
irq_time_write_begin(); @@ -73,8 +76,14 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); + else + account = false;
irq_time_write_end(); + + if (account && is_idle_task(curr)) + walt_account_irqtime(cpu, curr, delta, wallclock); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c39..3fe8b89 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -607,6 +607,16 @@ do { \ P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); +#ifdef CONFIG_SMP + P(cpu_capacity_orig); + P(cpu_capacity); +#ifdef CONFIG_SCHED_WALT + P(window_start); + P(cur_freq); + P(curr_runnable_sum); + P(prev_runnable_sum); +#endif +#endif PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bdcbeea..8724299 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -29,11 +29,13 @@ #include <linux/interrupt.h> #include <linux/mempolicy.h> #include <linux/migrate.h> +#include <linux/module.h> #include <linux/task_work.h>
#include <trace/events/sched.h>
#include "sched.h" +#include "walt.h"
/* * Targeted preemption latency for CPU-bound tasks: @@ -2882,6 +2884,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { unsigned long max = rq->cpu_capacity_orig; + unsigned long util = cpu_walt_util(rq);
/* * There are a few boundary cases this might miss but it should @@ -2899,8 +2902,8 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); + + cpufreq_update_util(rq_clock(rq), min(util, max), max); } }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7cbeb92..52a0ac5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -663,6 +663,16 @@ struct rq { u64 max_idle_balance_cost; #endif
+#ifdef CONFIG_SCHED_WALT + unsigned int cur_freq; + struct cpumask freq_domain_cpumask; + + int efficiency; /* Differentiate cpus with different IPC capability */ + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 0000000..203e02d --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,540 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + * 2016-08-31: Integration with mainline by Srivatsa Vaddagiri + * and Vikram Mulukutla + */ + +#include <linux/syscore_ops.h> +#include <linux/cpufreq.h> +#include <trace/events/sched.h> +#include "sched.h" +#include "walt.h" + + +char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK", + "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", + "IRQ_UPDATE"}; + +__read_mostly unsigned int sysctl_sched_use_walt_metrics = 1; + +static __read_mostly unsigned int walt_freq_account_wait_time; +static __read_mostly unsigned int walt_io_is_busy; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +static unsigned int __read_mostly walt_disabled; + +static unsigned int max_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. 
+ */ +static unsigned int max_possible_freq = 1; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + u64 prev_sum = 0; + + delta = wallclock - rq->window_start; + BUG_ON(delta < 0); + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + if (nr_windows == 1) + prev_sum = rq->curr_runnable_sum; + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; + delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_cpu_busy_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! 
*/ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + + /* Handle window rollover */ + if (new_window) { + if (!is_idle_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + } + + if (!account_cpu_busy_time(rq, p, irqtime, event)) + return; + + if (!new_window) { + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* IRQ busy time start = wallclock - irqtime */ + mark_start = wallclock - irqtime; + + if (mark_start > window_start) { + rq->curr_runnable_sum += scale_exec_time(irqtime, rq); + return; + } + + /* + * IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window first + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. 
*/ + delta = wallclock - window_start; + rq->curr_runnable_sum += scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + enum task_event event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_sched_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return 1024; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + } + + if (max) + max_possible_efficiency = max; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) + return; + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + unsigned long flags; + + if (!rq->cur_freq || rq->window_start || + walt_ktime_clock() < walt_ravg_window) + return; + + if (cpu == sync_cpu) { + raw_spin_lock_irqsave(&rq->lock, flags); + rq->window_start = walt_ktime_clock(); + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } else { + local_irq_save(flags); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + double_rq_unlock(rq, sync_rq); + local_irq_restore(flags); + } +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + else + double_lock_balance(src_rq, dest_rq); + + /* Note that same wallclock reference is used for all 3 events below */ + wallclock = walt_ktime_clock(); + + /* Update counters on both cpus first */ + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + /* Update task's counters */ + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + /* Fixup busy time */ + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + if (!dest_rq->window_start) { + p->ravg.curr_window = 0; + p->ravg.mark_start = 0; + } + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + if (!dest_rq->window_start) + p->ravg.prev_window = 0; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); + else + double_unlock_balance(src_rq, 
dest_rq); +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i; + + if (val != CPUFREQ_NOTIFY) + return 0; + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + cpu_rq(i)->cur_freq = policy->cur; + if (!cpu_rq(i)->window_start) + walt_set_window_start(cpu_rq(i)); + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + + return 0; +} + +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock_cpu(cpu) - wallclock; + + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), delta); + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +int fast_switching = 1; + +void walt_freq_transition(int cpu, unsigned long new_freq) +{ + int i; + unsigned long flags; + + local_irq_save(flags); + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + if (!fast_switching || + (fast_switching && smp_processor_id() != i)) + raw_spin_lock(&rq->lock); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + + if (!fast_switching || + (fast_switching && smp_processor_id() != i)) + raw_spin_unlock(&rq->lock); + if (!rq->window_start) + walt_set_window_start(rq); + } + local_irq_restore(flags); +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + walt_freq_transition(cpu, new_freq); + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!fast_switching) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + memset(&p->ravg, 0, sizeof(struct ravg)); +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 0000000..5b03995 --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, enum task_event event, + u64 wallclock, u64 irqtime); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); +void walt_freq_transition(int cpu, unsigned long new_freq); + +extern unsigned int sysctl_sched_use_walt_metrics; +extern unsigned int walt_ravg_window; + +/* Fold into cpu_util */ +static inline unsigned long cpu_walt_util(struct rq *rq) +{ + trace_sched_walt_util(cpu_of(rq), rq->cfs.avg.util_avg, rq->prev_runnable_sum * + rq->cpu_capacity_orig / walt_ravg_window); + + if (!sysctl_sched_use_walt_metrics) + return rq->cfs.avg.util_avg; + + return (rq->prev_runnable_sum * rq->cpu_capacity_orig) / + walt_ravg_window; +} + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } +static inline void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } + +static inline unsigned long cpu_walt_util(struct rq *rq) +{ + return rq->cfs.avg.util_avg; +} + +static inline void walt_freq_transition(int cpu, unsigned long new_freq) { }; + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_ravg_window; + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc3..4669a34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -439,6 +439,15 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif + { + .procname = "sched_use_walt_metrics", + .data = &sysctl_sched_use_walt_metrics, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project