This patch proposes a system-wide sysctl-aware default for the high-resolution timer slack value, which may be changed from 0 to HRTIMER_MAX_SLACK nanoseconds. Default system-wide and per-task values are HRTIMER_DEFAULT_SLACK. Per-task value isn't inherited across fork(); instead, newborn task uses system-wide value by default, and newborn thread uses it's group leader value.
Signed-off-by: Dmitry Antipov dmitry.antipov@linaro.org --- Documentation/sysctl/kernel.txt | 8 ++++++++ include/linux/hrtimer.h | 11 +++++++++++ include/linux/init_task.h | 2 +- include/linux/sched.h | 11 ++++++++--- kernel/fork.c | 9 +++++++-- kernel/futex.c | 4 ++-- kernel/hrtimer.c | 10 +++++++--- kernel/sys.c | 9 ++++----- kernel/sysctl.c | 10 ++++++++++ 9 files changed, 58 insertions(+), 16 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d78841..83b63ed 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -606,6 +606,14 @@ can be ORed together:
==============================================================
+timer_slack: + +This value can be used to query and set the default slack for +high-resolution timers, in nanoseconds. The default value is 50 +microseconds, and can be changed from 0 nanoseconds to 1 millisecond. + +============================================================== + unknown_nmi_panic:
The value in this file affects behavior of handling NMI. When the diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0dc30..77169b7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -24,6 +24,16 @@ #include <linux/timer.h> #include <linux/timerqueue.h>
+/* + * Default system-wide and per-task hrtimer slack, in nanoseconds. + */ +#define HRTIMER_DEFAULT_SLACK 50000 + +/* + * Reasonable limit for hrtimer slack. + */ +#define HRTIMER_MAX_SLACK 1000000 + struct hrtimer_clock_base; struct hrtimer_cpu_base;
@@ -323,6 +333,7 @@ extern ktime_t ktime_get_monotonic_offset(void);
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern int default_timer_slack_ns;
/* Exported timer functions: */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9c66b1a..b29be0d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -178,7 +178,7 @@ extern struct cred init_cred; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ - .timer_slack_ns = 50000, /* 50 usec default slack */ \ + .timer_slack_ns = HRTIMER_DEFAULT_SLACK, \ .pids = { \ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9b13f79..811e034 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1551,11 +1551,11 @@ struct task_struct { struct latency_record latency_record[LT_SAVECOUNT]; #endif /* - * time slack values; these are used to round up poll() and - * select() etc timeout values. These are in nanoseconds. + * High-resolution timer slack value, in nanoseconds. + * Used to round up poll()/select(), nanosleep, futex + * waiting, etc. timeout values of non-realtime tasks. */ unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns;
struct list_head *scm_work_list; #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -2631,6 +2631,11 @@ static inline int spin_needbreak(spinlock_t *lock) #endif }
+static inline unsigned long task_timer_slack(struct task_struct *tsk) +{ + return rt_task(tsk) ? 0 : tsk->timer_slack_ns; +} + /* * Thread group CPU time accounting. */ diff --git a/kernel/fork.c b/kernel/fork.c index b77fd55..6aaff93 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1164,8 +1164,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif - - p->default_timer_slack_ns = current->timer_slack_ns; + /* + * New thread inherits the slack from the group + * leader. New process uses system-default slack. + */ + p->timer_slack_ns = (clone_flags & CLONE_THREAD) ? + current->group_leader->timer_slack_ns : + default_timer_slack_ns;
task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/futex.c b/kernel/futex.c index 1614be2..a0d302d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1887,7 +1887,7 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); + task_timer_slack(current)); }
retry: @@ -2281,7 +2281,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); + task_timer_slack(current)); }
/* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf5..0c56fec 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -51,6 +51,12 @@ #include <trace/events/timer.h>
/* + * Default hrtimer slack value, in nanoseconds. May be changed in + * [0..HRTIMER_MAX_SLACK] range through kernel.timer_slack sysctl. + */ +__read_mostly int default_timer_slack_ns = HRTIMER_DEFAULT_SLACK; + +/* * The timer bases: * * There are more clockids then hrtimer bases. Thus, we index @@ -1564,9 +1570,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, int ret = 0; unsigned long slack;
- slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; + slack = task_timer_slack(current);
hrtimer_init_on_stack(&t.timer, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); diff --git a/kernel/sys.c b/kernel/sys.c index 4070153..ac32846 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -22,6 +22,7 @@ #include <linux/device.h> #include <linux/key.h> #include <linux/times.h> +#include <linux/hrtimer.h> #include <linux/posix-timers.h> #include <linux/security.h> #include <linux/dcookies.h> @@ -1917,12 +1918,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: - if (arg2 <= 0) - current->timer_slack_ns = - current->default_timer_slack_ns; - else + if (arg2 <= HRTIMER_MAX_SLACK) current->timer_slack_ns = arg2; - error = 0; + else + error = -EINVAL; break; case PR_MCE_KILL: if (arg4 | arg5) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f25..2cd42c6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,6 +136,7 @@ static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +static const int slack_max = HRTIMER_MAX_SLACK;
#ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> @@ -1004,6 +1005,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "timer_slack", + .data = &default_timer_slack_ns, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &slack_max, + }, { } };
On Mon, 20 Feb 2012 11:49:32 +0400 Dmitry Antipov dmitry.antipov@linaro.org wrote:
This patch proposes a system-wide sysctl-aware default for the high-resolution timer slack value, which may be changed from 0 to HRTIMER_MAX_SLACK nanoseconds. Default system-wide and per-task values are HRTIMER_DEFAULT_SLACK. Per-task value isn't inherited across fork(); instead, newborn task uses system-wide value by default, and newborn thread uses it's group leader value.
Well.. there are some back-incompatibilities here. prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to whatever-we-inherited-at-fork, but that has been removed. What are the implications of this, and did we need to do it?
If we do make changes in this area then the prctl manpage should be updated, please. And if http://www.spinics.net/lists/linux-man/msg01149.html represents the current state of that manpage then it should be updated anyway - that entry doesn't say anything about the (arg2 <= 0) case.
On 04/05/2012 04:10 AM, Andrew Morton wrote:
Well.. there are some back-incompatibilities here. prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to whatever-we-inherited-at-fork, but that has been removed. What are the implications of this, and did we need to do it?
It seems you're looking at the previous version of this patch (http://lkml.org/lkml/2012/2/20/55). Latest proposal is http://lwn.net/Articles/484162/, which defines PR_SET_TIMERSLACK action as: ... case PR_SET_TIMERSLACK: if (arg2 <= 0) current->timer_slack_ns = default_timer_slack_ns; else if (arg2 <= HRTIMER_MAX_SLACK) current->timer_slack_ns = arg2; else error = -EINVAL; break; ...
If we do make changes in this area then the prctl manpage should be updated, please. And if http://www.spinics.net/lists/linux-man/msg01149.html represents the current state of that manpage then it should be updated anyway - that entry doesn't say anything about the (arg2<= 0) case.
I sent a patch for man pages too, it should be one of the recent posts at http://www.spinics.net/lists/linux-man/index.html.
Dmitry
Dmitry,
On Fri, Apr 6, 2012 at 9:14 PM, Dmitry Antipov dmitry.antipov@linaro.org wrote:
On 04/05/2012 04:10 AM, Andrew Morton wrote:
Well.. there are some back-incompatibilities here. prctl(PR_SET_TIMERSLACK, -1) used to restore current's slack setting to whatever-we-inherited-at-fork, but that has been removed. What are the implications of this, and did we need to do it?
It seems you're looking at the previous version of this patch (http://lkml.org/lkml/2012/2/20/55). Latest proposal is http://lwn.net/Articles/484162/, which defines PR_SET_TIMERSLACK action as: ... case PR_SET_TIMERSLACK: if (arg2 <= 0) current->timer_slack_ns = default_timer_slack_ns; else if (arg2 <= HRTIMER_MAX_SLACK) current->timer_slack_ns = arg2; else error = -EINVAL; break; ...
If we do make changes in this area then the prctl manpage should be updated, please. And if http://www.spinics.net/lists/linux-man/msg01149.html represents the current state of that manpage then it should be updated anyway - that entry doesn't say anything about the (arg2<= 0) case.
I sent a patch for man pages too, it should be one of the recent posts at http://www.spinics.net/lists/linux-man/index.html.
Your response didn't actually address Andrew's point. Your patch changes user-visible semantics that have been in place since kernel 2.6.28. Specifically:
* The meaning of prctl(PS_SET_TIMESLACK, n) changes, for the n<0 case (formerly, this reverted the timer slack to the per-process "default", with the proposed patch, it reverts the timer slack to a system-wide default). * The semantics of setting the timer slack of a new thread have changed.
Perhaps these changes are warranted/necessary, but they *are* ABI changes, and so should be carefully explained and well justified.
Thanks,
Michael
PS As background to the discussion, here's the current draft of some text I plan to add to prctl(2) that explains the current semantics, which would change with Dmitry's patch:
prctl(2): PR_SET_TIMERSLACK (since Linux 2.6.28) Set the timer slack for the calling thread to the value in arg2. The timer slack is a value, expressed in nanoseconds, that is used by the kernel to group timer expirations for this thread that are close to one another; as a consequence, timer expirations for this thread may be up to the specified number of nanoseconds late (but will never expire early). Grouping timer expirations can help reduce system power con‐ sumption by minimizing CPU wake-ups.
The timer expirations affected by timer slack are those set by select(2), pselect(2), poll(2), ppoll(2), epoll_wait(2), epoll_pwait(2), clock_nanosleep(2), nanosleep(2), and futex(2) (and thus the library functions implemented via futexes: pthread_cond_timedwait(3), pthread_rwlock_timedrd‐ lock(3), pthread_rwlock_timedwrlock(3), and sem_wait(3)).
Each thread has two associated timer slack values: a "default" value, and a "current" value. The "current" value is the one that governs grouping of timer expirations. When a new thread is created, the two timer slack values are made the same as the "current" value of the creating thread. Thereafter, a thread can adjust its timer slack value via PR_SET_TIMERSLACK: if arg2 is greater than zero, then it specifies a new value for the "current" timer slack for the calling thread; if arg2 is less than or equal to zero, then the "current" timer slack is set to the "default" value. The timer slack value of init (PID 1), the ancestor of all threads, is 50,000 nanoseconds (50 microseconds).
fork(2): * The "default" timer slack of the child is set to the value of the "current" timer slack of the parent. (See the description of PR_SET_TIMERSLACK on prctl(2).)