It is usually interesting to do some statistics on a set of data, especially when the values are coming in a stream (measured time by time for instance).
Instead of having the statistics formula inside a specific subsystem, this small library provides the basic statistics functions available for all the kernel.
The library is designed to do the minimum computation when a new value is added. Only the basic value storage, array shifting and accumulation is done.
The average, variance and standard deviation is computed when requested via the corresponding functions.
The statistic library can deal up to 65536 values in the range of -2^24 and 2^24 - 1. These are large values and does not really make sense to the kernel code to use the statistics at these limits, so it is up to developer to use wisely the library.
Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org --- include/linux/stats.h | 29 +++++++ lib/Makefile | 3 +- lib/stats.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+), 1 deletion(-) create mode 100644 include/linux/stats.h create mode 100644 lib/stats.c
diff --git a/include/linux/stats.h b/include/linux/stats.h new file mode 100644 index 0000000..664eb30 --- /dev/null +++ b/include/linux/stats.h @@ -0,0 +1,29 @@ +/* + * include/linux/stats.h + */ +#ifndef _LINUX_STATS_H +#define _LINUX_STATS_H + +struct stats { + s64 sum; /* sum of values */ + u64 sq_sum; /* sum of the square values */ + s32 *values; /* array of values */ + s32 min; /* minimal value of the entire series */ + s32 max; /* maximal value of the entire series */ + unsigned int n; /* current number of values */ + unsigned int w_ptr; /* current window pointer */ + unsigned short len; /* size of the value array */ +}; + +extern s32 stats_max(struct stats *s); +extern s32 stats_min(struct stats *s); +extern s32 stats_mean(struct stats *s); +extern u32 stats_variance(struct stats *s); +extern u32 stats_stddev(struct stats *s); +extern void stats_add(struct stats *s, s32 val); +extern void stats_reset(struct stats *s); +extern void stats_free(struct stats *s); +extern struct stats *stats_alloc(unsigned short len); +extern unsigned short stats_n(struct stats *s); + +#endif /* _LINUX_STATS_H */ diff --git a/lib/Makefile b/lib/Makefile index 13a7c6a..18460d2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -26,7 +26,8 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o + percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ + stats.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o diff --git a/lib/stats.c b/lib/stats.c new file mode 100644 index 0000000..f5425d1 --- /dev/null +++ b/lib/stats.c @@ -0,0 +1,235 @@ +/* + * Implementation of basics statistics functions useful to compute on + * a stream of data. + * + * Copyright: (C) 2015-2016 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/stats.h> +#include <linux/types.h> + +/** + * stats_n - number of elements present in the statistics + * + * @s: the statistic structure + * + * Returns an unsigned short representing the number of values in the + * statistics series + */ +unsigned short stats_n(struct stats *s) +{ + return s->n; +} + +/** + * stats_max - maximal value of the series + * + * @s: the statistic structure + * + * Returns a s32 representing the maximum value of the series + */ +s32 stats_max(struct stats *s) +{ + return s->max; +} +EXPORT_SYMBOL_GPL(stats_max); + +/** + * stats_min - minimal value of the series + * + * @s: the statistic structure + * + * Returns a s32 representing the minimal value of the series + */ +s32 stats_min(struct stats *s) +{ + return s->min; +} +EXPORT_SYMBOL_GPL(stats_min); + +/** + * stats_mean - compute the average + * + * @s: the statistics structure + * + * Returns an s32 corresponding to the mean value, or zero if there is + * no data + */ +s32 stats_mean(struct stats *s) +{ + return s->n ? s->sum / s->n : 0; +} +EXPORT_SYMBOL_GPL(stats_mean); + +/** + * stats_variance - compute the variance + * + * @s: the statistic structure + * + * Returns an u32 corresponding to the variance, or zero if there is no + * data + */ +u32 stats_variance(struct stats *s) +{ + s32 mean = stats_mean(s); + return s->n ? (s->sq_sum / s->n) - (mean * mean) : 0; +} +EXPORT_SYMBOL_GPL(stats_variance); + +/** + * stats_stddev - compute the standard deviation + * + * @s: the statistic structure + * + * Returns an u32 corresponding to the standard deviation, or zero if + * there is no data + */ +u32 stats_stddev(struct stats *s) +{ + return int_sqrt(stats_variance(s)); +} +EXPORT_SYMBOL_GPL(stats_stddev); + +/** + * stats_add - add a new value in the statistic structure + * + * @s: the statistic structure + * @value: the new value to be added, max 2^24 - 1 + * + * Adds the value to the array, if the array is full, then the array + * is shifted. + * + */ +void stats_add(struct stats *s, s32 value) +{ + /* + * In order to prevent an overflow in the statistic code, we + * limit the value to 2^24 - 1, if it is greater we just + * ignore it with a WARN_ON_ONCE letting know to userspace we + * are dealing with out-of-range values. + */ + if (WARN_ON_ONCE(value >= ((2<<24) - 1))) + return; + + /* + * Insert the value in the array. If the array is already + * full, shift the values and add the value at the end of the + * series, otherwise add the value directly. + */ + if (likely(s->len == s->n)) { + s->sum -= s->values[s->w_ptr]; + s->sq_sum -= s->values[s->w_ptr] * s->values[s->w_ptr]; + s->values[s->w_ptr] = value; + s->w_ptr = (s->w_ptr + 1) % s->len; + } else { + s->values[s->n] = value; + s->n++; + } + + /* + * Keep track of the min and max. + */ + s->min = min(s->min, value); + s->max = max(s->max, value); + + /* + * In order to reduce the overhead and to prevent value + * derivation due to the integer computation, we just sum the + * value and do the division when the average and the variance + * are requested. + */ + s->sum += value; + s->sq_sum += value * value; +} +EXPORT_SYMBOL_GPL(stats_add); + +/** + * stats_reset - reset the stats + * + * @s: the statistic structure + * + * Reset the statistics and reset the values + * + */ +void stats_reset(struct stats *s) +{ + s->sum = s->sq_sum = s->n = s->w_ptr = 0; + s->max = S32_MIN; + s->min = S32_MAX; +} +EXPORT_SYMBOL_GPL(stats_reset); + +/** + * stats_alloc - allocate a structure for statistics + * + * @len: The number of items in the array which is limited by the + * unsigned short type. That allows to prevent overflow in all + * the statistics code. + * + * Allocates memory to store the different values and initialize the + * structure. + * + * In order to prevent an overflow in the computation, the maximum + * allowed number of values is 65536 and each value max is 2^24 - 1. + * + * The variance is the sum of the square value of the difference of + * each value to the average. The variance is a u64 (square values are + * always positives), so that gives a maximum of 18446744073709551615. + * We can store 65536 values, so: + * + * 18446744073709551615 / 65536 = 281474976710655 + * + * ... is the square max value we can have, hence the difference to + * the mean is max sqrt(281474976710655) = 16777215 (2^24 -1) + * + * Even if these values are not realistics (statistic in the kernel is + * for a few hundred values, large dispersion in the integer limits is + * very very rare so the sum won't be very high, or high integers + * series means low variance), we prevent any overflow in the code and + * we are safe. + * + * Returns a valid pointer a struct stats, NULL if the memory + * allocation fails. + */ +struct stats *stats_alloc(unsigned short len) +{ + struct stats *s; + s32 *values; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (s) { + values = kzalloc(sizeof(*values) * len, GFP_KERNEL); + if (!values) { + kfree(s); + return NULL; + } + + s->values = values; + s->len = len; + s->min = S32_MAX; + s->max = S32_MIN; + } + + return s; +} +EXPORT_SYMBOL_GPL(stats_alloc); + +/** + * stats_free - free the statistics structure + * + * @s : the statistics structure + * + * Frees the memory allocated by the function stats_alloc. + */ +void stats_free(struct stats *s) +{ + kfree(s->values); + kfree(s); +} +EXPORT_SYMBOL_GPL(stats_free); -- 1.9.1