The interrupt framework gives a lot of information about each interrupt (time accounting, statistics, ...).
Unfortunately there is no way to measure when interrupts occur and to build a mathematical model of their behavior that could help predict their next occurrence.
This framework allows registering a callback function that is invoked when an interrupt occurs.
Each time an interrupt fires, the callback is called with the timing information needed to derive the interval between two successive interrupts.
This framework allows a subsystem to register a handler in order to receive the timing information for the registered interrupt. That gives other subsystems the ability to compute predictions for the next interrupt occurrence.
The main objective is to track and detect periodic interrupts in order to predict the next event on a CPU and to anticipate the sleep duration when entering idle. This fine-grained approach makes it possible to simplify and rationalize wake-up event prediction without IPI interference, thus letting the scheduler be smarter about the wakeup IPIs with regard to the idle period.
In the proof of concept, the irq timings tracking showed an improvement in the predictions. The approach is correct, but my knowledge of the irq subsystem is limited; I am not sure this patch measuring irq time intervals is correct or acceptable, so it stays at the RFC stage (minus some polishing).
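For illustration, a client subsystem would hook into this API roughly as follows. This is only a sketch based on the interfaces added in the patch below (the follow-up patch in this series is the real consumer), and the example_* names are made up:

/* Hypothetical consumer of the irq timings framework (illustrative names). */
static void example_irq_timing_handler(unsigned int irq, ktime_t timestamp,
				       void *dev_id, void *data)
{
	/* Called from interrupt context each time 'irq' is handled. */
}

static struct irqtimings example_timings = {
	.handler = example_irq_timing_handler,
};

static int example_irq_setup(unsigned int irq, struct irqaction *act)
{
	struct irq_desc *desc = irq_to_desc(irq);
	unsigned long flags;

	raw_spin_lock_irqsave(&desc->lock, flags);
	desc->timings = &example_timings;
	raw_spin_unlock_irqrestore(&desc->lock, flags);

	return 0;
}

static void example_irq_free(unsigned int irq, void *dev_id)
{
	struct irq_desc *desc = irq_to_desc(irq);
	unsigned long flags;

	raw_spin_lock_irqsave(&desc->lock, flags);
	desc->timings = NULL;
	raw_spin_unlock_irqrestore(&desc->lock, flags);
}

static struct irqtimings_ops example_ops = {
	.setup = example_irq_setup,
	.free  = example_irq_free,
};

static int __init example_init(void)
{
	/* Only one user of the timing callbacks is allowed at a time. */
	return register_irq_timings(&example_ops);
}
late_initcall(example_init);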
Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org --- include/linux/interrupt.h | 45 ++++++++++++++++++++++++++++++++ include/linux/irqdesc.h | 3 +++ kernel/irq/Kconfig | 4 +++ kernel/irq/handle.c | 12 +++++++++ kernel/irq/manage.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 128 insertions(+), 1 deletion(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index be7e75c..f48e8ff 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -123,6 +123,51 @@ struct irqaction {
extern irqreturn_t no_action(int cpl, void *dev_id);
+#ifdef CONFIG_IRQ_TIMINGS +/** + * timing handler to be called when an interrupt happens + */ +typedef void (*irqt_handler_t)(unsigned int, ktime_t, void *, void *); + +/** + * struct irqtimings - per interrupt irq timings descriptor + * @handler: interrupt handler timings function + * @data: pointer to the private data to be passed to the handler + * @timestamp: latest interruption occurence + */ +struct irqtimings { + irqt_handler_t handler; + void *data; +} ____cacheline_internodealigned_in_smp; + +/** + * struct irqt_ops - structure to be used by the subsystem to call the + * register and unregister ops when an irq is setup or freed. + * @setup: registering callback + * @free: unregistering callback + * + * The callbacks assumes the lock is held on the irq desc + */ +struct irqtimings_ops { + int (*setup)(unsigned int, struct irqaction *); + void (*free)(unsigned int, void *); +}; + +extern int register_irq_timings(struct irqtimings_ops *ops); +extern int setup_irq_timings(unsigned int irq, struct irqaction *act); +extern void free_irq_timings(unsigned int irq, void *dev_id); +#else +static inline int setup_irq_timings(unsigned int irq, struct irqaction *act) +{ + return 0; +} + +static inline void free_irq_timings(unsigned int irq, void *dev_id) +{ + ; +} +#endif + extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index a587a33..e0d4263 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -51,6 +51,9 @@ struct irq_desc { #ifdef CONFIG_IRQ_PREFLOW_FASTEOI irq_preflow_handler_t preflow_handler; #endif +#ifdef CONFIG_IRQ_TIMINGS + struct irqtimings *timings; +#endif struct irqaction *action; /* IRQ action list */ unsigned int status_use_accessors; unsigned int core_internal_state__do_not_mess_with_it; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9a76e3b..1275fd1 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -73,6 +73,10 @@ config GENERIC_MSI_IRQ_DOMAIN config HANDLE_DOMAIN_IRQ bool
+config IRQ_TIMINGS + bool + default y + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e25a83b..ca8b0c5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,6 +132,17 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); }
+#ifdef CONFIG_IRQ_TIMINGS +void handle_irqt_event(struct irqtimings *irqt, struct irqaction *action) +{ + if (irqt) + irqt->handler(action->irq, ktime_get(), + action->dev_id, irqt->data); +} +#else +#define handle_irqt_event(a, b) +#endif + irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { @@ -165,6 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: flags |= action->flags; + handle_irqt_event(desc->timings, action); break;
default: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9a59f6..21cc7bf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1017,6 +1017,60 @@ static void irq_release_resources(struct irq_desc *desc) c->irq_release_resources(d); }
+#ifdef CONFIG_IRQ_TIMINGS +/* + * Global variable, only used by accessor functions, currently only + * one user is allowed and it is up to the caller to make sure to + * setup the irq timings which are already setup. + */ +static struct irqtimings_ops *irqtimings_ops; + +/** + * register_irq_timings - register the ops when an irq is setup or freed + * + * @ops: the register/unregister ops to be called when at setup or + * free time + * + * Returns -EBUSY if the slot is already in use, zero on success. + */ +int register_irq_timings(struct irqtimings_ops *ops) +{ + if (irqtimings_ops) + return -EBUSY; + + irqtimings_ops = ops; + + return 0; +} + +/** + * setup_irq_timings - call the timing register callback + * + * @desc: an irq desc structure + * + * Returns -EINVAL in case of error, zero on success. + */ +int setup_irq_timings(unsigned int irq, struct irqaction *act) +{ + if (irqtimings_ops && irqtimings_ops->setup) + return irqtimings_ops->setup(irq, act); + return 0; +} + +/** + * free_irq_timings - call the timing unregister callback + * + * @irq: the interrupt number + * @dev_id: the device id + * + */ +void free_irq_timings(unsigned int irq, void *dev_id) +{ + if (irqtimings_ops && irqtimings_ops->free) + irqtimings_ops->free(irq, dev_id); +} +#endif /* CONFIG_IRQ_TIMINGS */ + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1037,6 +1091,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV;
+ ret = setup_irq_timings(irq, new); + if (ret) + goto out_mput; /* * Check whether the interrupt nests into another interrupt * thread. @@ -1045,7 +1102,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nested) { if (!new->thread_fn) { ret = -EINVAL; - goto out_mput; + goto out_free_timings; } /* * Replace the primary handler which was provided from @@ -1323,6 +1380,10 @@ out_thread: kthread_stop(t); put_task_struct(t); } + +out_free_timings: + free_irq_timings(irq, new->dev_id); + out_mput: module_put(desc->owner); return ret; @@ -1408,6 +1469,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
unregister_handler_proc(irq, action);
+ free_irq_timings(irq, dev_id); + /* Make sure it's not being used on another CPU: */ synchronize_irq(irq);
-- 1.9.1
Many IRQs are quiet most of the time, or they tend to come in bursts of fairly equal time intervals within each burst. It is therefore possible to detect those IRQs with stable intervals and guestimate when the next IRQ event is most likely to happen.
Examples of such IRQs may include audio related IRQs where the FIFO size and/or DMA descriptor size with the sample rate create stable intervals, block devices during large data transfers, etc. Even network streaming of multimedia content creates patterns of periodic network interface IRQs in some cases.
This patch adds code to track the mean interval and variance for each IRQ over a window of time intervals between IRQ events. Those statistics can be used to assist cpuidle in selecting the most appropriate sleep state by predicting the most likely time for the next interrupt.
Because the stats are gathered in interrupt context, the core computation is as light as possible.
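For illustration only, the windowed bookkeeping described above boils down to something like the following standalone sketch; the structure and window size are made up for this example and the kernel code in the patch below is the authoritative version:

#include <stdint.h>

#define WINDOW_NR_SAMPLES 4	/* power of two: divisions become shifts */

struct irq_interval_stats {
	int64_t  sum;				/* running sum of the intervals */
	uint64_t sq_sum;			/* running sum of the squared intervals */
	int32_t  samples[WINDOW_NR_SAMPLES];	/* circular window of intervals (us) */
	unsigned int nr;			/* samples seen so far, capped at the window size */
	unsigned int head;			/* slot to overwrite once the window is full */
};

static void interval_add(struct irq_interval_stats *s, int32_t value)
{
	if (s->nr == WINDOW_NR_SAMPLES) {
		int32_t old = s->samples[s->head];

		/* Window full: the oldest sample leaves the running sums. */
		s->sum    -= old;
		s->sq_sum -= (uint64_t)old * old;
		s->samples[s->head] = value;
		s->head = (s->head + 1) % WINDOW_NR_SAMPLES;
	} else {
		s->samples[s->nr++] = value;
	}

	s->sum    += value;
	s->sq_sum += (uint64_t)value * value;
}

static int32_t interval_mean(const struct irq_interval_stats *s)
{
	/* Like the patch, divide by the full window size even before it fills. */
	return s->sum / WINDOW_NR_SAMPLES;
}

static uint64_t interval_variance(const struct irq_interval_stats *s, int32_t mean)
{
	/* Var(X) = E[X^2] - E[X]^2 */
	return s->sq_sum / WINDOW_NR_SAMPLES - (uint64_t)mean * mean;
}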
Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org --- drivers/cpuidle/Kconfig | 5 + kernel/irq/Kconfig | 1 - kernel/sched/Makefile | 1 + kernel/sched/idle-sched.c | 547 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 553 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/idle-sched.c
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 8c7930b..dd17215 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -25,6 +25,11 @@ config CPU_IDLE_GOV_MENU bool "Menu governor (for tickless system)" default y
+config CPU_IDLE_GOV_SCHED + bool "Sched idle governor" + select IRQ_TIMINGS + default y + config DT_IDLE_STATES bool
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1275fd1..81557ae 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -75,7 +75,6 @@ config HANDLE_DOMAIN_IRQ
config IRQ_TIMINGS bool - default y
config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6768797..f7d5a35 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_IDLE_GOV_SCHED) += idle-sched.o diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c new file mode 100644 index 0000000..45c6e4c --- /dev/null +++ b/kernel/sched/idle-sched.c @@ -0,0 +1,547 @@ +/* + * Copyright (C) 2016 Linaro Ltd, Daniel Lezcano daniel.lezcano@linaro.org + * Nicolas Pitre nicolas.pitre@linaro.org + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/irqdesc.h> +#include <linux/ktime.h> +#include <linux/slab.h> +#include <linux/tick.h> +#include <linux/time64.h> + +/* + * Define the number of values we will be dealing with to the monitor + * the interruptions. We define 2^x values to do stats on and as it is + * a multiple of 2, bits shifting will be faster than an optimized + * division. + */ +#define STATS_MAX_VALUE 4 + +/** + * struct stats - internal structure to encapsulate stats informations + * + * @sum: sum of the values + * @sq_sum: sum of the square values + * @values: array of values to do stats on + * @n: current number of values + * @w_ptr: current buffer pointer + */ +struct stats { + s64 sum; /* sum of values */ + u64 sq_sum; /* sum of the square values */ + s32 values[STATS_MAX_VALUE]; /* array of values */ + unsigned int n; /* current number of values */ + unsigned int w_ptr; /* current window pointer */ +}; + +/** + * struct wakeup - internal structure describing a source of wakeup + * + * @stats: the stats structure on the different event intervals + * @timestamp: latest update timestamp + */ +struct wakeup { + struct stats stats; + ktime_t timestamp; +}; + +/* + * Per cpu and irq statistics. Each cpu receives interrupts and those + * ones can be distributed following an irq chip specific + * algorithm. Random irq distribution is the worst case to predict + * interruption behavior but usually that does not happen or could be + * fixed from userspace by setting the irq affinity. + */ +static DEFINE_PER_CPU(struct wakeup, *wakesup[NR_IRQS]); + +/** + * stats_add - add a new value in the statistic structure + * + * @s: the statistic structure + * @value: the new value to be added, max 2^24 - 1 + * + * Adds the value to the array, if the array is full, then the array + * is shifted. + * + */ +static void stats_add(struct stats *s, s32 value) +{ + int limit = 1 << 24; + /* + * In order to prevent an overflow in the statistic code, we + * limit the value to 2^24 - 1, if it is greater we just + * ignore it with a WARN_ON_ONCE letting know to userspace we + * are dealing with out-of-range values. + */ + if (WARN_ON_ONCE(value >= (limit - 1) || value < -limit)) + return; + + /* + * Insert the value in the array. If the array is already + * full, shift the values and add the value at the end of the + * series, otherwise add the value directly. 
+ */ + if (likely(s->n == STATS_MAX_VALUE)) { + s->sum -= s->values[s->w_ptr]; + s->sq_sum -= s->values[s->w_ptr] * s->values[s->w_ptr]; + s->values[s->w_ptr] = value; + s->w_ptr = (s->w_ptr + 1) % STATS_MAX_VALUE; + } else { + s->values[s->n] = value; + s->n++; + } + + /* + * In order to reduce the overhead and to prevent value + * derivation due to the integer computation, we just sum the + * value and do the division when the average and the variance + * are requested. + */ + s->sum += value; + s->sq_sum += (u64)value * value; +} + +/** + * stats_reset - reset the stats + * + * @s: the statistic structure + * + * Reset the statistics and reset the values + */ +static inline void stats_reset(struct stats *s) +{ + s->sum = s->sq_sum = s->n = s->w_ptr = 0; +} + +/** + * stats_mean - compute the average + * + * @s: the statistics structure + * + * Returns an s32 corresponding to the mean value, or zero if there is + * no data + */ +static inline s32 stats_mean(struct stats *s) +{ + /* + * gcc is smart enough to convert to a bits shift when the + * divisor is constant and multiple of 2^x. + * + * The number of values could have not reached STATS_MAX_VALUE + * yet, but we can consider it acceptable as the situation is + * only at the very first beginning in the system life cycle. + */ + return s->sum / STATS_MAX_VALUE; +} + +/** + * stats_variance - compute the variance + * + * @s: the statistic structure + * + * Returns an u64 corresponding to the variance, or zero if there is + * no data + */ +static u64 stats_variance(struct stats *s, s32 mean) +{ + return (s->sq_sum / STATS_MAX_VALUE) - ((u64)mean * mean); +} + +/** + * sched_idle_irq - irq timestamp callback + * + * @irq: the irq number + * @timestamp: when the interrupt occured + * @dev_id: device id for shared interrupt (not yet used) + * @data: private data given when registering this callback + * + * Interrupt callback called when an interrupt happens. This function + * is critical as it is called under an interrupt section: minimum + * operations as possible are done here: + */ +static void sched_idle_irq(unsigned int irq, ktime_t timestamp, + void *dev_id, void *data) +{ + s32 diff; + unsigned int cpu = raw_smp_processor_id(); + struct wakeup *w = per_cpu(wakesup[irq], cpu); + + if (!w) + return; + + /* + * It is the first time the interrupt occurs of the series, we + * can't do any stats as we don't have an interval, just store + * the timestamp and exit. + */ + if (!ktime_to_us(w->timestamp)) { + w->timestamp = timestamp; + return; + } + + /* + * Microsec resolution is enough for our purpose. + */ + diff = ktime_us_delta(timestamp, w->timestamp); + w->timestamp = timestamp; + + /* + * Add the value, it results in: + * - two substractions + * - two additions + * - two multiplications + * (without the sanity check) + */ + stats_add(&w->stats, diff); +} + +/* + * Callback to be called when an interrupt happens. + */ +static struct irqtimings irq_timings = { + .handler = sched_idle_irq, +}; + +static ktime_t next_irq_event(void) +{ + unsigned int irq, cpu = raw_smp_processor_id(); + ktime_t diff, next, min = (ktime_t){ .tv64 = KTIME_MAX }; + struct wakeup *w; + s32 interval, mean; + u64 variance; + + /* + * Lookup the interrupt array for this cpu and search for the + * earlier expected interruption. + */ + for (irq = 0; irq < NR_IRQS; irq++) { + w = per_cpu(wakesup[irq], cpu); + + /* + * The interrupt was not setup as a source of a wakeup + * or the wakeup source is not considered at this + * moment stable enough to do a prediction. 
+ */ + if (!w) + continue; + + /* + * No statistics available yet. + */ + if (!ktime_to_us(w->timestamp)) + continue; + + /* + * If the mean value is null, just ignore this wakeup + * source. + */ + mean = stats_mean(&w->stats); + if (!mean) + continue; + + variance = stats_variance(&w->stats, mean); + /* + * We want to check the last interval is: + * + * mean - stddev < interval < mean + stddev + * + * That simplifies to: + * + * 0 < interval < 2 * stddev + * + * The standard deviation is the sqrt of the variance: + * + * 0 < interval < 2 * sqrt(variance) + * + * and we want to prevent to do an sqrt, so we square + * the equation: + * + * 0 < interval^2 < (2 * sqrt(variance))^2 + * + * 0 < interval^2 < 4 * variance + * + * As the time a monotonic positive function it is + * always greater than zero: + * + * interval^2 < 4 * variance + * + * So if the latest value of the stats complies with + * this condition, then the wakeup source is + * considered predictable and can be used to predict + * the next event. + */ + interval = w->stats.values[w->stats.n]; + if ((u64)interval * interval > 4 * variance) + continue; + + /* + * Let's compute the next event: the wakeup source is + * considered predictable, we add the average interval + * time added to the latest interruption event time. + */ + next = ktime_add_us(w->timestamp, stats_mean(&w->stats)); + + /* + * If the interrupt is supposed to happen before the + * minimum time, then it becomes the minimum. + */ + if (ktime_before(next, min)) + min = next; + } + + /* + * At this point, we have our prediction but the caller is + * expecting the remaining time before the next event, so + * compute the diff. + */ + diff = ktime_sub(min, ktime_get()); + + /* + * The result could be negative for different reasons: + * - the prediction is incorrect + * - the prediction was too near now and expired while we were + * in this function + * + * In both cases, we return KTIME_MAX as a failure to do a + * prediction + */ + if (ktime_compare(diff, ktime_set(0, 0)) <= 0) + return (ktime_t){ .tv64 = KTIME_MAX }; + + return diff; +} + +/** + * sched_idle_next_wakeup - Predict the next wakeup on the current cpu + * + * The next event on the cpu is based on a statistic approach of the + * interrupt events and the timer deterministic value. From the timer + * or the irqs, we return the one expected to occur first. + * + * Returns the expected remaining idle time before being woken up by + * an interruption. + */ +s64 sched_idle_next_wakeup(void) +{ + s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length()); + s64 next_irq = ktime_to_us(next_irq_event()); + + return min(next_irq, next_timer); +} + +/** + * sched_idle - go to idle for a specified amount of time + * + * @duration: the idle duration time + * @latency: the latency constraint + * + * Returns 0 on success, < 0 otherwise. + */ +int sched_idle(s64 duration, unsigned int latency) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + struct cpuidle_state_usage *su; + struct cpuidle_state *s; + int i, ret = 0, index = -1; + + rcu_idle_enter(); + + /* + * No cpuidle driver is available, let's use the default arch + * idle function. + */ + if (cpuidle_not_available(drv, dev)) + goto default_idle; + + /* + * Find the idle state with the lowest power while satisfying + * our constraints. We will save energy if the duration of the + * idle time is bigger than the target residency which is the + * break even point. 
The choice will be modulated by the + * latency. + */ + for (i = 0; i < drv->state_count; i++) { + + s = &drv->states[i]; + + su = &dev->states_usage[i]; + + if (s->disabled || su->disable) + continue; + if (s->target_residency > duration) + continue; + if (s->exit_latency > latency) + continue; + + index = i; + } + + /* + * The idle task must be scheduled, it is pointless to go to + * idle, just re-enable the interrupt and return. + */ + if (current_clr_polling_and_test()) { + local_irq_enable(); + goto out; + } + + if (index < 0) { + /* + * No idle callbacks fulfilled the constraints, jump + * to the default function like there wasn't any + * cpuidle driver. + */ + goto default_idle; + } else { + /* + * Enter the idle state previously returned by the + * governor decision. This function will block until + * an interrupt occurs and will take care of + * re-enabling the local interrupts + */ + return cpuidle_enter(drv, dev, index); + } + +default_idle: + default_idle_call(); +out: + rcu_idle_exit(); + return ret; +} + +/** + * sched_irq_timing_free - free_irq callback + * + * @irq: the irq number to stop tracking + * @dev_id: not used at the moment + * + * This function will remove from the wakeup source prediction table. + */ +static void sched_irq_timing_free(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct wakeup *w; + unsigned int cpu; + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->timings = NULL; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + for_each_possible_cpu(cpu) { + w = per_cpu(wakesup[irq], cpu); + if (!w) + continue; + + per_cpu(wakesup[irq], cpu) = NULL; + kfree(w); + } +} + + +/** + * sched_irq_timing_setup - setup_irq callback + * + * @irq: the interrupt numbe to be tracked + * @act: the new irq action to be set to this interrupt + * + * Update the irq table to be tracked in order to predict the next event. + * + * Returns zero on success. On error it returns -ENOMEM. + */ +static int sched_irq_timing_setup(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct wakeup *w; + unsigned int cpu; + unsigned long flags; + int ret = -ENOMEM; + + /* + * No interrupt set for this descriptor or related to a timer. + * Timers are deterministic, so no need to try to do any + * prediction on them. No error for both cases, we are just not + * interested. + */ + if (!act || (act->flags & __IRQF_TIMER)) + return 0; + + /* + * Allocates the wakeup structure and the stats structure. As + * the interrupt can occur on any cpu, allocate the wakeup + * structure per cpu basis. + */ + for_each_possible_cpu(cpu) { + + w = kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) + goto undo; + + per_cpu(wakesup[irq], cpu) = w; + } + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->timings = &irq_timings; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + ret = 0; +undo: + /* + * Rollback all allocations on failure + */ + if (ret) + sched_irq_timing_free(irq, act->dev_id); + + return ret; +} + +/* + * Setup/free irq callbacks + */ +static struct irqtimings_ops irqt_ops = { + .setup = sched_irq_timing_setup, + .free = sched_irq_timing_free, +}; + +/** + * sched_idle_init - setup the interrupt tracking table + * + * At init time, some interrupts could have been setup and in the + * system life time, some devices could be setup. 
In order to track + * all interrupts we are interested in, we first register a couple of + * callback to keep up-to-date the interrupt tracking table and then + * we setup the table with the interrupt which were already set up. + */ +int __init sched_idle_init(void) +{ + struct irq_desc *desc; + unsigned int irq; + int ret; + + /* + * Register the setup/free irq callbacks, so new interrupt or + * freed interrupt will update their tracking. + */ + ret = register_irq_timings(&irqt_ops); + if (ret) { + pr_err("Failed to register timings ops\n"); + return ret; + } + + /* + * For all the irq already setup, assign the timing callback. + * All interrupts with their desc NULL will be discarded. + */ + for_each_irq_desc(irq, desc) + sched_irq_timing_setup(irq, desc->action); + + return 0; +} +late_initcall(sched_idle_init); -- 1.9.1
On Tue, 15 Dec 2015, Daniel Lezcano wrote:
Many IRQs are quiet most of the time, or they tend to come in bursts of fairly equal time intervals within each burst. It is therefore possible to detect those IRQs with stable intervals and guestimate when the next IRQ event is most likely to happen.
Examples of such IRQs may include audio related IRQs where the FIFO size and/or DMA descriptor size with the sample rate create stable intervals, block devices during large data transfers, etc. Even network streaming of multimedia content creates patterns of periodic network interface IRQs in some cases.
This patch adds code to track the mean interval and variance for each IRQ over a window of time intervals between IRQ events. Those statistics can be used to assist cpuidle in selecting the most appropriate sleep state by predicting the most likely time for the next interrupt.
Because the stats are gathered in interrupt context, the core computation is as light as possible.
Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org
Comments below.
drivers/cpuidle/Kconfig | 5 + kernel/irq/Kconfig | 1 - kernel/sched/Makefile | 1 + kernel/sched/idle-sched.c | 547 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 553 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/idle-sched.c
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig index 8c7930b..dd17215 100644 --- a/drivers/cpuidle/Kconfig +++ b/drivers/cpuidle/Kconfig @@ -25,6 +25,11 @@ config CPU_IDLE_GOV_MENU bool "Menu governor (for tickless system)" default y
+config CPU_IDLE_GOV_SCHED
- bool "Sched idle governor"
- select IRQ_TIMINGS
- default y
config DT_IDLE_STATES bool
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1275fd1..81557ae 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -75,7 +75,6 @@ config HANDLE_DOMAIN_IRQ
config IRQ_TIMINGS bool
- default y
config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6768797..f7d5a35 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_IDLE_GOV_SCHED) += idle-sched.o diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c new file mode 100644 index 0000000..45c6e4c --- /dev/null +++ b/kernel/sched/idle-sched.c @@ -0,0 +1,547 @@ +/*
- Copyright (C) 2016 Linaro Ltd, Daniel Lezcano daniel.lezcano@linaro.org
Nicolas Pitre <nicolas.pitre@linaro.org>
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation.
- */
+#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/irqdesc.h> +#include <linux/ktime.h> +#include <linux/slab.h> +#include <linux/tick.h> +#include <linux/time64.h>
+/*
- Define the number of values we will be dealing with to the monitor
- the interruptions. We define 2^x values to do stats on and as it is
- a multiple of 2, bits shifting will be faster than an optimized
- division.
- */
This comment is a bit confusing. What is 2^x relating to? And bit shifting is one possible form of an optimized division. Etc.
I'd suggest the following comment instead:
/* * Define the number of samples over which the average and variance * are computed. A power of 2 is preferred so to let the compiler * optimize divisions by that number with simple arithmetic shifts. */
+#define STATS_MAX_VALUE 4
This should also be something like STATS_MAX_NR_VALUES otherwise this could be interpreted as the maximum value used in the statistics.
+/**
- struct stats - internal structure to encapsulate stats informations
- @sum: sum of the values
- @sq_sum: sum of the square values
- @values: array of values to do stats on
- @n: current number of values
- @w_ptr: current buffer pointer
- */
+struct stats {
- s64 sum; /* sum of values */
- u64 sq_sum; /* sum of the square values */
- s32 values[STATS_MAX_VALUE]; /* array of values */
- unsigned int n; /* current number of values */
- unsigned int w_ptr; /* current window pointer */
Given we're dealing with time intervals here, they will always be positive values. All fields can therefore be unsigned to make things a little more efficient.
+};
+/**
- struct wakeup - internal structure describing a source of wakeup
- @stats: the stats structure on the different event intervals
- @timestamp: latest update timestamp
- */
+struct wakeup {
- struct stats stats;
- ktime_t timestamp;
+};
+/*
- Per cpu and irq statistics. Each cpu receives interrupts and those
- ones can be distributed following an irq chip specific
- algorithm. Random irq distribution is the worst case to predict
- interruption behavior but usually that does not happen or could be
- fixed from userspace by setting the irq affinity.
- */
+static DEFINE_PER_CPU(struct wakeup, *wakesup[NR_IRQS]);
s/wakesup/wakeups/
+/**
- stats_add - add a new value in the statistic structure
- @s: the statistic structure
- @value: the new value to be added, max 2^24 - 1
- Adds the value to the array, if the array is full, then the array
- is shifted.
Strictly speaking, nothing is shifted. Instead you could say "if the array is full, the oldest value is replaced."
- */
+static void stats_add(struct stats *s, s32 value) +{
- int limit = 1 << 24;
- /*
* In order to prevent an overflow in the statistic code, we
* limit the value to 2^24 - 1, if it is greater we just
* ignore it with a WARN_ON_ONCE letting know to userspace we
* are dealing with out-of-range values.
*/
- if (WARN_ON_ONCE(value >= (limit - 1) || value < -limit))
return;
This limit check is rather useless. Given we're adding microsecs intervals here, 2^24 microsecs is equal to 4.6 hours. IRQ predictions of that nature make no sense. In fact, any IRQ interval greater than a few seconds should simply reset the stats.
/*
* Insert the value in the array. If the array is already
* full, shift the values and add the value at the end of the
* series, otherwise add the value directly.
*/
- if (likely(s->n == STATS_MAX_VALUE)) {
s->sum -= s->values[s->w_ptr];
s->sq_sum -= s->values[s->w_ptr] * s->values[s->w_ptr];
Same overflow issue: you should cast one term to a 64-bit value, like you did below.
s->values[s->w_ptr] = value;
s->w_ptr = (s->w_ptr + 1) % STATS_MAX_VALUE;
- } else {
s->values[s->n] = value;
s->n++;
- }
/*
* In order to reduce the overhead and to prevent value
* derivation due to the integer computation, we just sum the
* value and do the division when the average and the variance
* are requested.
*/
- s->sum += value;
- s->sq_sum += (u64)value * value;
Now... let's assume we do want to limit statistics on IRQs that are less than a few seconds apart. We can have everything fit in 32-bit values. Assuming the maximum value of a u32, the sum of squared values may not exceed (2^32)-1. You have STATS_MAX_VALUE such values. That means:
sqrt( ((2^32)-1) / STATS_MAX_VALUE ) = 32767
So... with 4 values we can fit everything in 32 bits if the IRQ interval is limited to 32 seconds. If STATS_MAX_VALUE is 8 then the max interval should be 23 seconds. That interval is probably way larger than practically predictable.
+}
+/**
- stats_reset - reset the stats
- @s: the statistic structure
- Reset the statistics and reset the values
- */
+static inline void stats_reset(struct stats *s) +{
- s->sum = s->sq_sum = s->n = s->w_ptr = 0;
+}
+/**
- stats_mean - compute the average
- @s: the statistics structure
- Returns an s32 corresponding to the mean value, or zero if there is
- no data
- */
+static inline s32 stats_mean(struct stats *s) +{
- /*
* gcc is smart enough to convert to a bits shift when the
* divisor is constant and multiple of 2^x.
*
* The number of values could have not reached STATS_MAX_VALUE
* yet, but we can consider it acceptable as the situation is
* only at the very first beginning in the system life cycle.
*/
- return s->sum / STATS_MAX_VALUE;
+}
+/**
- stats_variance - compute the variance
- @s: the statistic structure
- Returns an u64 corresponding to the variance, or zero if there is
- no data
- */
+static u64 stats_variance(struct stats *s, s32 mean) +{
- return (s->sq_sum / STATS_MAX_VALUE) - ((u64)mean * mean);
+}
+/**
- sched_idle_irq - irq timestamp callback
- @irq: the irq number
- @timestamp: when the interrupt occured
- @dev_id: device id for shared interrupt (not yet used)
- @data: private data given when registering this callback
- Interrupt callback called when an interrupt happens. This function
- is critical as it is called under an interrupt section: minimum
- operations as possible are done here:
- */
+static void sched_idle_irq(unsigned int irq, ktime_t timestamp,
void *dev_id, void *data)
+{
- s32 diff;
- unsigned int cpu = raw_smp_processor_id();
- struct wakeup *w = per_cpu(wakesup[irq], cpu);
- if (!w)
return;
- /*
* It is the first time the interrupt occurs of the series, we
* can't do any stats as we don't have an interval, just store
* the timestamp and exit.
*/
- if (!ktime_to_us(w->timestamp)) {
Here ktime_to_us() implies a division. It is a division with a constant divisor so it gets optimized into a multiplication by the reciprocal, but still rather costly just for the purpose of comparing with zero.
Instead, you should use:
if (!w->timestamp.tv64)
Or if you want to preserve the ktime_t abstraction you could use:
if (ktime_equal(w->timestamp, ktime_set(0, 0)))
w->timestamp = timestamp;
return;
- }
- /*
* Microsec resolution is enough for our purpose.
*/
- diff = ktime_us_delta(timestamp, w->timestamp);
- w->timestamp = timestamp;
- /*
* Add the value, it results in:
* - two substractions
* - two additions
* - two multiplications
* (without the sanity check)
The (questionable) sanity check notwithstanding, if you want to be precise, you'd have to take into account the addition from the pointer update and the modulo (which ought to be turned into a mask). The code is trivial and obvious enough (more than what my code looked like) that it is probably not worth tracking those operations here.
- stats_add(&w->stats, diff);
+}
+/*
- Callback to be called when an interrupt happens.
- */
+static struct irqtimings irq_timings = {
- .handler = sched_idle_irq,
+};
+static ktime_t next_irq_event(void) +{
- unsigned int irq, cpu = raw_smp_processor_id();
- ktime_t diff, next, min = (ktime_t){ .tv64 = KTIME_MAX };
To be consistent, you should use:
min = ktime_set(KTIME_SEC_MAX, 0)
Same thing for the return value below.
- struct wakeup *w;
- s32 interval, mean;
- u64 variance;
- /*
* Lookup the interrupt array for this cpu and search for the
* earlier expected interruption.
*/
for (irq = 0; irq < NR_IRQS; irq++) {
w = per_cpu(wakesup[irq], cpu);
/*
* The interrupt was not setup as a source of a wakeup
* or the wakeup source is not considered at this
* moment stable enough to do a prediction.
*/
if (!w)
continue;
/*
* No statistics available yet.
*/
if (!ktime_to_us(w->timestamp))
continue;
Same comment as before.
/*
* If the mean value is null, just ignore this wakeup
* source.
*/
mean = stats_mean(&w->stats);
if (!mean)
continue;
variance = stats_variance(&w->stats, mean);
/*
* We want to check the last interval is:
*
* mean - stddev < interval < mean + stddev
*
* That simplifies to:
*
* 0 < interval < 2 * stddev
That isn't mathematically correct.
The only simplification you may do here is:
mean - stddev < interval < mean + stddev
-> -stddev < interval - mean < stddev
*
* The standard deviation is the sqrt of the variance:
*
* 0 < interval < 2 * sqrt(variance)
*
* and we want to prevent to do an sqrt, so we square
* the equation:
*
* 0 < interval^2 < (2 * sqrt(variance))^2
*
* 0 < interval^2 < 4 * variance
Just imagine that you have a perfectly periodic interrupt that always happens at a precise interval. In that case the variance will be zero. You'll discard it although it is perfectly predictable.
Given the correct expression, that should instead be:
-stddev < interval - mean < stddev
--> abs(interval - mean) < stddev
--> abs(interval - mean) < sqrt(variance)
--> (interval - mean)^2 < variance
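In code, a sketch of that corrected test could look like this (illustrative only, reusing the variable names from the patch):

	s64 delta = (s64)interval - mean;

	/*
	 * Keep the wakeup source only when the last interval lies
	 * within one standard deviation of the mean.  Comparing with
	 * "> variance" (rather than ">=") also keeps the perfectly
	 * periodic case where both delta and variance are zero.
	 */
	if ((u64)(delta * delta) > variance)
		continue;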
*
* As the time a monotonic positive function it is
* always greater than zero:
*
* interval^2 < 4 * variance
*
* So if the latest value of the stats complies with
* this condition, then the wakeup source is
* considered predictable and can be used to predict
* the next event.
*/
interval = w->stats.values[w->stats.n];
if ((u64)interval * interval > 4 * variance)
continue;
/*
* Let's compute the next event: the wakeup source is
* considered predictable, we add the average interval
* time added to the latest interruption event time.
*/
next = ktime_add_us(w->timestamp, stats_mean(&w->stats));
/*
* If the interrupt is supposed to happen before the
* minimum time, then it becomes the minimum.
*/
if (ktime_before(next, min))
min = next;
- }
- /*
* At this point, we have our prediction but the caller is
* expecting the remaining time before the next event, so
* compute the diff.
*/
- diff = ktime_sub(min, ktime_get());
- /*
* The result could be negative for different reasons:
* - the prediction is incorrect
* - the prediction was too near now and expired while we were
* in this function
*
* In both cases, we return KTIME_MAX as a failure to do a
* prediction
*/
- if (ktime_compare(diff, ktime_set(0, 0)) <= 0)
return (ktime_t){ .tv64 = KTIME_MAX };
- return diff;
+}
+/**
- sched_idle_next_wakeup - Predict the next wakeup on the current cpu
- The next event on the cpu is based on a statistic approach of the
- interrupt events and the timer deterministic value. From the timer
- or the irqs, we return the one expected to occur first.
- Returns the expected remaining idle time before being woken up by
- an interruption.
- */
+s64 sched_idle_next_wakeup(void) +{
- s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
- s64 next_irq = ktime_to_us(next_irq_event());
- return min(next_irq, next_timer);
+}
+/**
- sched_idle - go to idle for a specified amount of time
- @duration: the idle duration time
- @latency: the latency constraint
- Returns 0 on success, < 0 otherwise.
- */
+int sched_idle(s64 duration, unsigned int latency) +{
- struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
- struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
- struct cpuidle_state_usage *su;
- struct cpuidle_state *s;
int i, ret = 0, index = -1;
rcu_idle_enter();
- /*
* No cpuidle driver is available, let's use the default arch
* idle function.
*/
if (cpuidle_not_available(drv, dev))
goto default_idle;
/*
* Find the idle state with the lowest power while satisfying
* our constraints. We will save energy if the duration of the
* idle time is bigger than the target residency which is the
* break even point. The choice will be modulated by the
* latency.
*/
for (i = 0; i < drv->state_count; i++) {
s = &drv->states[i];
su = &dev->states_usage[i];
if (s->disabled || su->disable)
continue;
if (s->target_residency > duration)
continue;
if (s->exit_latency > latency)
continue;
index = i;
}
/*
* The idle task must be scheduled, it is pointless to go to
* idle, just re-enable the interrupt and return.
*/
- if (current_clr_polling_and_test()) {
local_irq_enable();
goto out;
- }
- if (index < 0) {
/*
* No idle callbacks fulfilled the constraints, jump
* to the default function like there wasn't any
* cpuidle driver.
*/
goto default_idle;
- } else {
/*
* Enter the idle state previously returned by the
* governor decision. This function will block until
* an interrupt occurs and will take care of
* re-enabling the local interrupts
*/
return cpuidle_enter(drv, dev, index);
- }
+default_idle:
- default_idle_call();
+out:
- rcu_idle_exit();
- return ret;
+}
+/**
- sched_irq_timing_free - free_irq callback
- @irq: the irq number to stop tracking
- @dev_id: not used at the moment
- This function will remove from the wakeup source prediction table.
- */
+static void sched_irq_timing_free(unsigned int irq, void *dev_id) +{
- struct irq_desc *desc = irq_to_desc(irq);
- struct wakeup *w;
- unsigned int cpu;
- unsigned long flags;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->timings = NULL;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- for_each_possible_cpu(cpu) {
w = per_cpu(wakesup[irq], cpu);
if (!w)
continue;
per_cpu(wakesup[irq], cpu) = NULL;
kfree(w);
What if another CPU is currently in next_irq_event and has a reference to w while it is freed?
- }
+}
+/**
- sched_irq_timing_setup - setup_irq callback
- @irq: the interrupt numbe to be tracked
- @act: the new irq action to be set to this interrupt
- Update the irq table to be tracked in order to predict the next event.
- Returns zero on success. On error it returns -ENOMEM.
- */
+static int sched_irq_timing_setup(unsigned int irq, struct irqaction *act) +{
- struct irq_desc *desc = irq_to_desc(irq);
- struct wakeup *w;
- unsigned int cpu;
- unsigned long flags;
- int ret = -ENOMEM;
- /*
* No interrupt set for this descriptor or related to a timer.
* Timers are deterministic, so no need to try to do any
* prediction on them. No error for both cases, we are just not
* interested.
*/
- if (!act || (act->flags & __IRQF_TIMER))
return 0;
- /*
* Allocates the wakeup structure and the stats structure. As
* the interrupt can occur on any cpu, allocate the wakeup
* structure per cpu basis.
*/
- for_each_possible_cpu(cpu) {
w = kzalloc(sizeof(*w), GFP_KERNEL);
if (!w)
goto undo;
per_cpu(wakesup[irq], cpu) = w;
- }
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->timings = &irq_timings;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- ret = 0;
+undo:
- /*
* Rollback all allocations on failure
*/
- if (ret)
sched_irq_timing_free(irq, act->dev_id);
- return ret;
+}
+/*
- Setup/free irq callbacks
- */
+static struct irqtimings_ops irqt_ops = {
- .setup = sched_irq_timing_setup,
- .free = sched_irq_timing_free,
+};
+/**
- sched_idle_init - setup the interrupt tracking table
- At init time, some interrupts could have been setup and in the
- system life time, some devices could be setup. In order to track
- all interrupts we are interested in, we first register a couple of
- callback to keep up-to-date the interrupt tracking table and then
- we setup the table with the interrupt which were already set up.
- */
+int __init sched_idle_init(void) +{
- struct irq_desc *desc;
- unsigned int irq;
- int ret;
- /*
* Register the setup/free irq callbacks, so new interrupt or
* freed interrupt will update their tracking.
*/
- ret = register_irq_timings(&irqt_ops);
- if (ret) {
pr_err("Failed to register timings ops\n");
return ret;
- }
- /*
* For all the irq already setup, assign the timing callback.
* All interrupts with their desc NULL will be discarded.
*/
- for_each_irq_desc(irq, desc)
sched_irq_timing_setup(irq, desc->action);
- return 0;
+}
+late_initcall(sched_idle_init);
On 12/15/2015 11:16 PM, Nicolas Pitre wrote:
On Tue, 15 Dec 2015, Daniel Lezcano wrote:
Many IRQs are quiet most of the time, or they tend to come in bursts of fairly equal time intervals within each burst. It is therefore possible to detect those IRQs with stable intervals and guestimate when the next IRQ event is most likely to happen.
Examples of such IRQs may include audio related IRQs where the FIFO size and/or DMA descriptor size with the sample rate create stable intervals, block devices during large data transfers, etc. Even network streaming of multimedia content creates patterns of periodic network interface IRQs in some cases.
This patch adds code to track the mean interval and variance for each IRQ over a window of time intervals between IRQ events. Those statistics can be used to assist cpuidle in selecting the most appropriate sleep state by predicting the most likely time for the next interrupt.
Because the stats are gathered in interrupt context, the core computation is as light as possible.
Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org
Comments below.
[ ... ]
+/*
- Define the number of values we will be dealing with to the monitor
- the interruptions. We define 2^x values to do stats on and as it is
- a multiple of 2, bits shifting will be faster than an optimized
- division.
- */
This comment is a bit confusing. What is 2^x relating to? And bit shifting is one possible form of an optimized division. Etc.
I'd suggest the following comment instead:
/*
- Define the number of samples over which the average and variance
- are computed. A power of 2 is preferred so to let the compiler
- optimize divisions by that number with simple arithmetic shifts.
*/
Ok.
+#define STATS_MAX_VALUE 4
This should also be something like STATS_MAX_NR_VALUES otherwise this could be interpreted as the maximum value used in the statistics.
Agree.
+/**
- struct stats - internal structure to encapsulate stats informations
- @sum: sum of the values
- @sq_sum: sum of the square values
- @values: array of values to do stats on
- @n: current number of values
- @w_ptr: current buffer pointer
- */
+struct stats {
- s64 sum; /* sum of values */
- u64 sq_sum; /* sum of the square values */
- s32 values[STATS_MAX_VALUE]; /* array of values */
- unsigned int n; /* current number of values */
- unsigned int w_ptr; /* current window pointer */
Given we're dealing with time intervals here, they will always be positive values. All fields can therefore be unsigned to make things a little more efficient.
Agree.
+};
+/**
- struct wakeup - internal structure describing a source of wakeup
- @stats: the stats structure on the different event intervals
- @timestamp: latest update timestamp
- */
+struct wakeup {
- struct stats stats;
- ktime_t timestamp;
+};
+/*
- Per cpu and irq statistics. Each cpu receives interrupts and those
- ones can be distributed following an irq chip specific
- algorithm. Random irq distribution is the worst case to predict
- interruption behavior but usually that does not happen or could be
- fixed from userspace by setting the irq affinity.
- */
+static DEFINE_PER_CPU(struct wakeup, *wakesup[NR_IRQS]);
s/wakesup/wakeups/
Noted.
+/**
- stats_add - add a new value in the statistic structure
- @s: the statistic structure
- @value: the new value to be added, max 2^24 - 1
- Adds the value to the array, if the array is full, then the array
- is shifted.
Strictly speaking, nothing is shifted. Instead you could say "if the array is full, the oldest value is replaced."
Agree.
- */
+static void stats_add(struct stats *s, s32 value) +{
- int limit = 1 << 24;
- /*
* In order to prevent an overflow in the statistic code, we
* limit the value to 2^24 - 1, if it is greater we just
* ignore it with a WARN_ON_ONCE letting know to userspace we
* are dealing with out-of-range values.
*/
- if (WARN_ON_ONCE(value >= (limit - 1) || value < -limit))
return;
This limit check is rather useless. Given we're adding microsecs intervals here, 2^24 microsecs is equal to 4.6 hours. IRQ predictions of that nature make no sense. In fact, any IRQ interval greater than a few seconds should simply reset the stats.
Ok.
In the previous code, when the irq interval was greater than one second we reset the stats. I noted a strange behavior with the network where the results were worse. Actually, on my test system the hardware is a bit weird: for the same CPU there are several interrupts for the same NIC and it round-robins between them at an interval of several seconds, so all interrupts were always discarded. Removing the stats reset improved the situation. I preferred to remove this portion of code in order to refocus on it later with a rationale rather than 'one second is long enough'.
I also suspect we may end up with a tasklet to do more complex statistics, especially using a standard error for a burst of intervals.
/*
* Insert the value in the array. If the array is already
* full, shift the values and add the value at the end of the
* series, otherwise add the value directly.
*/
- if (likely(s->n == STATS_MAX_VALUE)) {
s->sum -= s->values[s->w_ptr];
s->sq_sum -= s->values[s->w_ptr] * s->values[s->w_ptr];
Same overflow issue: you should cast one term to a 64-bit value, like you did below.
Good point.
s->values[s->w_ptr] = value;
s->w_ptr = (s->w_ptr + 1) % STATS_MAX_VALUE;
- } else {
s->values[s->n] = value;
s->n++;
- }
/*
* In order to reduce the overhead and to prevent value
* derivation due to the integer computation, we just sum the
* value and do the division when the average and the variance
* are requested.
*/
- s->sum += value;
- s->sq_sum += (u64)value * value;
Now... let's assume we do want to limit statistics on IRQs that are less than a few seconds apart. We can have everything fit in 32-bit values. Assuming the maximum value of a u32, the sum of squared values may not exceed (2^32)-1. You have STATS_MAX_VALUE such values. That means:
sqrt( ((2^32)-1) / STATS_MAX_VALUE ) = 32767
So... with 4 values we can fit everything in 32 bits if the IRQ interval is limited to 32 seconds.
Why 32 seconds? Isn't it 32 milliseconds?
If STATS_MAX_VALUE is 8 then the max interval should be 23 seconds. That interval is probably way larger than practically predictable.
[ ... ]
- /*
* It is the first time the interrupt occurs of the series, we
* can't do any stats as we don't have an interval, just store
* the timestamp and exit.
*/
- if (!ktime_to_us(w->timestamp)) {
Here ktime_to_us() implies a division. It is a division with a constant divisor so it gets optimized into a multiplication by the reciprocal, but still rather costly just for the purpose of comparing with zero.
Instead, you should use:
if (!w->timestamp.tv64)
Or if you want to preserve the ktime_t abstraction you could use:
if (ktime_equal(w->timestamp, ktime_set(0, 0)))
Ok.
w->timestamp = timestamp;
return;
- }
- /*
* Microsec resolution is enough for our purpose.
*/
- diff = ktime_us_delta(timestamp, w->timestamp);
- w->timestamp = timestamp;
- /*
* Add the value, it results in:
* - two substractions
* - two additions
* - two multiplications
* (without the sanity check)
The (questionable) sanity check notwithstanding, if you want to be precise, you'd have to take into account the addition from the pointer update and the modulo (which ought to be turned into a mask). The code is trivial and obvious enough (more than what my code looked like) that it is probably not worth tracking those operations here.
Yes, such a comment would lead to endless discussions and close the door to adding new operations.
- stats_add(&w->stats, diff);
+}
+/*
- Callback to be called when an interrupt happens.
- */
+static struct irqtimings irq_timings = {
- .handler = sched_idle_irq,
+};
+static ktime_t next_irq_event(void) +{
- unsigned int irq, cpu = raw_smp_processor_id();
- ktime_t diff, next, min = (ktime_t){ .tv64 = KTIME_MAX };
To be consistent, you should use:
min = ktime_set(KTIME_SEC_MAX, 0)
Same thing for the return value below.
Ok.
- struct wakeup *w;
- s32 interval, mean;
- u64 variance;
- /*
* Lookup the interrupt array for this cpu and search for the
* earlier expected interruption.
*/
for (irq = 0; irq < NR_IRQS; irq++) {
w = per_cpu(wakesup[irq], cpu);
/*
* The interrupt was not setup as a source of a wakeup
* or the wakeup source is not considered at this
* moment stable enough to do a prediction.
*/
if (!w)
continue;
/*
* No statistics available yet.
*/
if (!ktime_to_us(w->timestamp))
continue;
Same comment as before.
Ok.
/*
* If the mean value is null, just ignore this wakeup
* source.
*/
mean = stats_mean(&w->stats);
if (!mean)
continue;
variance = stats_variance(&w->stats, mean);
/*
* We want to check the last interval is:
*
* mean - stddev < interval < mean + stddev
*
* That simplifies to:
*
* 0 < interval < 2 * stddev
That isn't mathematically correct.
The only simplification you may do here is:
mean - stddev < interval < mean + stddev
-> -stddev < interval - mean < stddev
*
* The standard deviation is the sqrt of the variance:
*
* 0 < interval < 2 * sqrt(variance)
*
* and we want to prevent to do an sqrt, so we square
* the equation:
*
* 0 < interval^2 < (2 * sqrt(variance))^2
*
* 0 < interval^2 < 4 * variance
Just imagine that you have a perfectly periodic interrupt that always happens at a precise interval. In that case the variance will be zero. You'll discard it although it is perfectly predictable.
Grumble ...
Given the correct expression, that should instead be:
-stddev < interval - mean < stddev
--> abs(interval - mean) < stddev
--> abs(interval - mean) < sqrt(variance)
--> (interval - mean)^2 < variance
Ok, I will fix this.
*
* As the time a monotonic positive function it is
* always greater than zero:
*
* interval^2 < 4 * variance
*
* So if the latest value of the stats complies with
* this condition, then the wakeup source is
* considered predictable and can be used to predict
* the next event.
*/
interval = w->stats.values[w->stats.n];
if ((u64)interval * interval > 4 * variance)
continue;
[ ... ]
+/**
- sched_irq_timing_free - free_irq callback
- @irq: the irq number to stop tracking
- @dev_id: not used at the moment
- This function will remove from the wakeup source prediction table.
- */
+static void sched_irq_timing_free(unsigned int irq, void *dev_id) +{
- struct irq_desc *desc = irq_to_desc(irq);
- struct wakeup *w;
- unsigned int cpu;
- unsigned long flags;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->timings = NULL;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- for_each_possible_cpu(cpu) {
w = per_cpu(wakesup[irq], cpu);
if (!w)
continue;
per_cpu(wakesup[irq], cpu) = NULL;
kfree(w);
What if another CPU is currently in next_irq_event and has a reference to w while it is freed?
At first glance, I don't think that can happen.
If we are in next_irq_event, the lock is held, preventing free_irq from being called. Then interrupts are disabled and the irq handler is unset, preventing the interrupt handler with the next_irq_event hook from being called.
However, the IRQ code is complex and I will double-check this.
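For what it's worth, if the existing locking turned out to be insufficient, one possible direction would be RCU protection of the per-cpu pointers. This is an untested sketch, and it assumes struct wakeup gains an rcu_head field:

struct wakeup {
	struct stats	stats;
	ktime_t		timestamp;
	struct rcu_head	rcu;	/* hypothetical new field for kfree_rcu() */
};

	/* Reader side, e.g. in next_irq_event(): */
	rcu_read_lock();
	w = rcu_dereference(per_cpu(wakesup[irq], cpu));
	if (w) {
		/* use w->stats and w->timestamp */
	}
	rcu_read_unlock();

	/* Writer side, e.g. in sched_irq_timing_free(): */
	w = per_cpu(wakesup[irq], cpu);
	rcu_assign_pointer(per_cpu(wakesup[irq], cpu), NULL);
	if (w)
		kfree_rcu(w, rcu);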
Thanks for the review Nico.
-- Daniel
[ ... ]
On Wed, 16 Dec 2015, Daniel Lezcano wrote:
On 12/15/2015 11:16 PM, Nicolas Pitre wrote:
On Tue, 15 Dec 2015, Daniel Lezcano wrote:
+static void stats_add(struct stats *s, s32 value) +{
- int limit = 1 << 24;
- /*
* In order to prevent an overflow in the statistic code, we
* limit the value to 2^24 - 1, if it is greater we just
* ignore it with a WARN_ON_ONCE letting know to userspace we
* are dealing with out-of-range values.
*/
- if (WARN_ON_ONCE(value >= (limit - 1) || value < -limit))
return;
This limit check is rather useless. Given we're adding microsecs intervals here, 2^24 microsecs is equal to 4.6 hours. IRQ predictions of that nature make no sense. In fact, any IRQ interval greater than a few seconds should simply reset the stats.
OK, as noted below, I mixed up ms with us. 2^24 microsecs is actually 16 seconds. That's probably fairly common. More below.
Ok.
In the previous code, when the irq interval was greater than one second we reset the stats. I noted a strange behavior with the network where the results were worse. Actually, on my test system the hardware is a bit weird: for the same CPU there are several interrupts for the same NIC and it round-robins between them at an interval of several seconds, so all interrupts were always discarded. Removing the stats reset improved the situation. I preferred to remove this portion of code in order to refocus on it later with a rationale rather than 'one second is long enough'.
I also suspect we may end up with a tasklet to do more complex statistics, especially using a standard error for a burst of intervals.
Now... let's assume we do want to limit statistics on IRQs that are less than a few seconds apart. We can have everything fit in 32-bit values. Assuming the maximum value of a u32, the sum of squared values may not exceed (2^32)-1. You have STATS_MAX_VALUE such values. That means:
sqrt( ((2^32)-1) / STATS_MAX_VALUE ) = 32767
So... with 4 values we can fit everything in 32 bits if the IRQ interval is limited to 32 seconds.
Why 32 seconds? Isn't it 32 milliseconds?
Doh. Indeed. Scratch that one.
That means the actual limit with 64 bits would be:
sqrt( ((2^64)-1) / STATS_MAX_VALUE ) = 2147483647
That's 35 minutes. No IRQ interval prediction should ever be that long. So the limit of 16 seconds above is questionable and probably unjustified here.
The limit should be part of the policy at a higher level. Silently dropping intervals greater than 16 seconds without resetting the stats is _possibly_ a good thing to do. But right now it happens by accident rather than by design.
I'd suggest documenting the restriction for stats_add() saying that the value must be less than sqrt(((2^64)-1) / STATS_MAX_VALUE) and then enforcing a cutoff policy at the caller. Right now it is 16 seconds, but maybe some other value might provide even better results.
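Something along these lines, for example; the 16-second cutoff and the macro name are only placeholders, what matters is the shape of the check:

/*
 * stats_add() requirement (to be documented): the absolute value of
 * 'value' must stay below sqrt(U64_MAX / STATS_MAX_NR_VALUES) so that
 * sq_sum cannot overflow.
 */

/* Caller-side cutoff, e.g. in sched_idle_irq(): */
#define IRQT_INTERVAL_CUTOFF_US	(16 * USEC_PER_SEC)

	diff = ktime_us_delta(timestamp, w->timestamp);
	w->timestamp = timestamp;

	if (diff > IRQT_INTERVAL_CUTOFF_US) {
		/* Too far apart to be part of a pattern: start over. */
		stats_reset(&w->stats);
		return;
	}

	stats_add(&w->stats, diff);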
Nicolas
This patch plugs the sched-idle code into the idle loop. It is experimental code and consequently it disables the suspend-to-idle path, as that one is not handled by sched-idle.
Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org --- kernel/sched/idle-sched.c | 30 ++++++++++++++++++++++++++++++ kernel/sched/idle.c | 11 +++++++++-- kernel/sched/sched.h | 20 ++++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c index 45c6e4c..e078956 100644 --- a/kernel/sched/idle-sched.c +++ b/kernel/sched/idle-sched.c @@ -60,6 +60,36 @@ struct wakeup { */ static DEFINE_PER_CPU(struct wakeup, *wakesup[NR_IRQS]);
+/* + * Variable flag to switch from cpuidle to idle-sched + */ +static int __read_mostly __sched_idle_enabled = 0; + +/* + * Enable the sched_idle subsystem + */ +void sched_idle_enable(void) +{ + __sched_idle_enabled = 1; +} + +/* + * Disable the sched_idle subsystem + */ +void sched_idle_disable(void) +{ + __sched_idle_enabled = 0; +} + +/* + * Accessor flag to check if this idle subsystem should be used + * instead of the old cpuidle framework. + */ +int sched_idle_enabled(void) +{ + return __sched_idle_enabled; +} + /** * stats_add - add a new value in the statistic structure * diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a..c3f9938 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -247,8 +247,15 @@ static void cpu_idle_loop(void) */ if (cpu_idle_force_poll || tick_check_broadcast_expired()) cpu_idle_poll(); - else - cpuidle_idle_call(); + else { + if (sched_idle_enabled()) { + int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); + s64 duration = sched_idle_next_wakeup(); + sched_idle(duration, latency); + } else { + cpuidle_idle_call(); + } + }
arch_cpu_idle_exit(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d2a119..4f80048 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -35,6 +35,26 @@ extern void update_cpu_load_active(struct rq *this_rq); static inline void update_cpu_load_active(struct rq *this_rq) { } #endif
+#ifdef CONFIG_CPU_IDLE_GOV_SCHED +extern int sched_idle(s64 duration, unsigned int latency); +extern s64 sched_idle_next_wakeup(void); +extern int sched_idle_enabled(void); +#else +static inline int sched_idle(s64, unsigned int) +{ + return 0; +} + +static inline s64 sched_idle_next_wakeup(void) +{ +} + +static inline int sched_idle_enabled(void) +{ + return 0; +} +#endif + /* * Helpers for converting nanosecond timing to jiffy resolution */ -- 1.9.1