Hi all,
This small patch set fixes a bug in the vmevent core and makes vmevent-test buildable without the unneeded SDL library.
Plus, we add a new 'cross' event type: the event will trigger whenever a value crosses a user-specified threshold. It works both ways, i.e. when a value crosses the threshold from the lesser-values side to the greater-values side, and vice versa.
We use the event type in a userspace low-memory killer: we get a notification when memory becomes low, so we start freeing memory by killing unneeded processes, and we get a notification when memory crosses the threshold from the other side, so we know that we have freed enough memory.
The patches are against
git://github.com/penberg/linux.git vmevent/core
Thanks!
vmevent grabs a mutex in the atomic context, and so this pops up:
BUG: sleeping function called from invalid context at kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0 1 lock held by swapper/0/0: #0: (&watch->timer){+.-...}, at: [<ffffffff8103eb80>] call_timer_fn+0x0/0xf0 Pid: 0, comm: swapper/0 Not tainted 3.2.0+ #6 Call Trace: <IRQ> [<ffffffff8102f5da>] __might_sleep+0x12a/0x1e0 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff81321f2c>] mutex_lock_nested+0x3c/0x340 [<ffffffff81064b33>] ? lock_acquire+0xa3/0xc0 [<ffffffff8103eb80>] ? internal_add_timer+0x110/0x110 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff810bda21>] vmevent_timer_fn+0x91/0xf0 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff8103ebf5>] call_timer_fn+0x75/0xf0 [<ffffffff8103eb80>] ? internal_add_timer+0x110/0x110 [<ffffffff81062fdd>] ? trace_hardirqs_on_caller+0x7d/0x120 [<ffffffff8103ee9f>] run_timer_softirq+0x10f/0x1e0 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff81038d90>] __do_softirq+0xb0/0x160 [<ffffffff8105eb0f>] ? tick_program_event+0x1f/0x30 [<ffffffff8132642c>] call_softirq+0x1c/0x26 [<ffffffff810036d5>] do_softirq+0x85/0xc0
This patch fixes the issue by removing the mutex and making the logic lock-free.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- mm/vmevent.c | 35 ++++++++++++----------------------- 1 files changed, 12 insertions(+), 23 deletions(-)
diff --git a/mm/vmevent.c b/mm/vmevent.c index 1847b56..a56174f 100644 --- a/mm/vmevent.c +++ b/mm/vmevent.c @@ -1,4 +1,5 @@ #include <linux/anon_inodes.h> +#include <linux/atomic.h> #include <linux/vmevent.h> #include <linux/syscalls.h> #include <linux/timer.h> @@ -23,7 +24,7 @@ struct vmevent_watch { struct vmevent_config config;
struct mutex mutex; - bool pending; + atomic_t pending;
/* * Attributes that are exported as part of delivered VM events. @@ -103,20 +104,18 @@ static void vmevent_sample(struct vmevent_watch *watch) { int i;
+ if (atomic_read(&watch->pending)) + return; if (!vmevent_match(watch)) return;
- mutex_lock(&watch->mutex); - - watch->pending = true; - for (i = 0; i < watch->nr_attrs; i++) { struct vmevent_attr *attr = &watch->sample_attrs[i];
attr->value = vmevent_sample_attr(watch, attr); }
- mutex_unlock(&watch->mutex); + atomic_set(&watch->pending, 1); }
static void vmevent_timer_fn(unsigned long data) @@ -125,7 +124,7 @@ static void vmevent_timer_fn(unsigned long data)
vmevent_sample(watch);
- if (watch->pending) + if (atomic_read(&watch->pending)) wake_up(&watch->waitq); mod_timer(&watch->timer, jiffies + nsecs_to_jiffies64(watch->config.sample_period_ns)); @@ -148,13 +147,9 @@ static unsigned int vmevent_poll(struct file *file, poll_table *wait)
poll_wait(file, &watch->waitq, wait);
- mutex_lock(&watch->mutex); - - if (watch->pending) + if (atomic_read(&watch->pending)) events |= POLLIN;
- mutex_unlock(&watch->mutex); - return events; }
@@ -171,15 +166,13 @@ static ssize_t vmevent_read(struct file *file, char __user *buf, size_t count, l if (count < size) return -EINVAL;
- mutex_lock(&watch->mutex); - - if (!watch->pending) - goto out_unlock; + if (!atomic_read(&watch->pending)) + goto out;
event = kmalloc(size, GFP_KERNEL); if (!event) { ret = -ENOMEM; - goto out_unlock; + goto out; }
for (i = 0; i < watch->nr_attrs; i++) { @@ -195,14 +188,10 @@ static ssize_t vmevent_read(struct file *file, char __user *buf, size_t count, l
ret = count;
- watch->pending = false; - + atomic_set(&watch->pending, 0); out_free: kfree(event); - -out_unlock: - mutex_unlock(&watch->mutex); - +out: return ret; }
On Mon, 2012-04-09 at 03:38 +0400, Anton Vorontsov wrote:
vmevent grabs a mutex in the atomic context, and so this pops up:
BUG: sleeping function called from invalid context at kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0 1 lock held by swapper/0/0: #0: (&watch->timer){+.-...}, at: [<ffffffff8103eb80>] call_timer_fn+0x0/0xf0 Pid: 0, comm: swapper/0 Not tainted 3.2.0+ #6 Call Trace: <IRQ> [<ffffffff8102f5da>] __might_sleep+0x12a/0x1e0 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff81321f2c>] mutex_lock_nested+0x3c/0x340 [<ffffffff81064b33>] ? lock_acquire+0xa3/0xc0 [<ffffffff8103eb80>] ? internal_add_timer+0x110/0x110 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff810bda21>] vmevent_timer_fn+0x91/0xf0 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff8103ebf5>] call_timer_fn+0x75/0xf0 [<ffffffff8103eb80>] ? internal_add_timer+0x110/0x110 [<ffffffff81062fdd>] ? trace_hardirqs_on_caller+0x7d/0x120 [<ffffffff8103ee9f>] run_timer_softirq+0x10f/0x1e0 [<ffffffff810bd990>] ? vmevent_match+0xe0/0xe0 [<ffffffff81038d90>] __do_softirq+0xb0/0x160 [<ffffffff8105eb0f>] ? tick_program_event+0x1f/0x30 [<ffffffff8132642c>] call_softirq+0x1c/0x26 [<ffffffff810036d5>] do_softirq+0x85/0xc0
This patch fixes the issue by removing the mutex and making the logic lock-free.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org
What guarantees that there's only one thread writing to struct vmevent_attr::value in vmevent_sample() now that the mutex is gone?
mm/vmevent.c | 35 ++++++++++++----------------------- 1 files changed, 12 insertions(+), 23 deletions(-)
diff --git a/mm/vmevent.c b/mm/vmevent.c index 1847b56..a56174f 100644 --- a/mm/vmevent.c +++ b/mm/vmevent.c @@ -1,4 +1,5 @@ #include <linux/anon_inodes.h> +#include <linux/atomic.h> #include <linux/vmevent.h> #include <linux/syscalls.h> #include <linux/timer.h> @@ -23,7 +24,7 @@ struct vmevent_watch { struct vmevent_config config; struct mutex mutex;
- bool pending;
- atomic_t pending;
/* * Attributes that are exported as part of delivered VM events. @@ -103,20 +104,18 @@ static void vmevent_sample(struct vmevent_watch *watch) { int i;
- if (atomic_read(&watch->pending))
if (!vmevent_match(watch)) return;return;
- mutex_lock(&watch->mutex);
- watch->pending = true;
- for (i = 0; i < watch->nr_attrs; i++) { struct vmevent_attr *attr = &watch->sample_attrs[i];
attr->value = vmevent_sample_attr(watch, attr); }
- mutex_unlock(&watch->mutex);
- atomic_set(&watch->pending, 1);
} static void vmevent_timer_fn(unsigned long data) @@ -125,7 +124,7 @@ static void vmevent_timer_fn(unsigned long data) vmevent_sample(watch);
- if (watch->pending)
- if (atomic_read(&watch->pending)) wake_up(&watch->waitq); mod_timer(&watch->timer, jiffies + nsecs_to_jiffies64(watch->config.sample_period_ns));
@@ -148,13 +147,9 @@ static unsigned int vmevent_poll(struct file *file, poll_table *wait) poll_wait(file, &watch->waitq, wait);
- mutex_lock(&watch->mutex);
- if (watch->pending)
- if (atomic_read(&watch->pending)) events |= POLLIN;
- mutex_unlock(&watch->mutex);
- return events;
} @@ -171,15 +166,13 @@ static ssize_t vmevent_read(struct file *file, char __user *buf, size_t count, l if (count < size) return -EINVAL;
- mutex_lock(&watch->mutex);
- if (!watch->pending)
goto out_unlock;
- if (!atomic_read(&watch->pending))
goto out;
event = kmalloc(size, GFP_KERNEL); if (!event) { ret = -ENOMEM;
goto out_unlock;
}goto out;
for (i = 0; i < watch->nr_attrs; i++) { @@ -195,14 +188,10 @@ static ssize_t vmevent_read(struct file *file, char __user *buf, size_t count, l ret = count;
- watch->pending = false;
- atomic_set(&watch->pending, 0);
out_free: kfree(event);
-out_unlock:
- mutex_unlock(&watch->mutex);
+out: return ret; }
On Mon, Apr 09, 2012 at 11:40:31AM +0300, Pekka Enberg wrote:
On Mon, 2012-04-09 at 03:38 +0400, Anton Vorontsov wrote:
vmevent grabs a mutex in the atomic context, and so this pops up:
BUG: sleeping function called from invalid context at kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0
[...]
This patch fixes the issue by removing the mutex and making the logic lock-free.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org
What guarantees that there's only one thread writing to struct vmevent_attr::value in vmevent_sample() now that the mutex is gone?
Well, it is called from the timer function, which has the same guarantees as an interrupt handler: it can have only one execution thread (unlike a bare softirq handler), so we don't need to worry about racing w/ ourselves?
If you're concerned about several instances of timers accessing the same vmevent_watch, I don't really see how it is possible, as we allocate vmevent_watch together w/ the timer instance in vmevent_fd(), so there is always one timer per vmevent_watch.
Thanks,
On Mon, Apr 09, 2012 at 11:40:31AM +0300, Pekka Enberg wrote:
On Mon, 2012-04-09 at 03:38 +0400, Anton Vorontsov wrote:
vmevent grabs a mutex in the atomic context, and so this pops up:
BUG: sleeping function called from invalid context at kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 0, name: swapper/0
[...]
This patch fixes the issue by removing the mutex and making the logic lock-free.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org
What guarantees that there's only one thread writing to struct vmevent_attr::value in vmevent_sample() now that the mutex is gone?
On Mon, Apr 9, 2012 at 3:29 PM, Anton Vorontsov anton.vorontsov@linaro.org wrote:
Well, it is called from the timer function, which has the same guaranties as an interrupt handler: it can have only one execution thread (unlike bare softirq handler), so we don't need to worry about racing w/ ourselves?
If you're concerned about several instances of timers accessing the same vmevent_watch, I don't really see how it is possible, as we allocate vmevent_watch together w/ the timer instance in vmevent_fd(), so there is always one timer per vmevent_watch.
Makes sense. A big fat comment on top of vmevent_sample() explaining all this would be helpful... ;-)
Pekka
panacea:~/src/linux/linux-vmevent/tools/testing/vmevent$ make cc -O3 -g -std=gnu99 -Wcast-align -Wformat -Wformat-security -Wformat-y2k -Wshadow -Winit-self -Wpacked -Wredundant-decls -Wstrict-aliasing=3 -Wswitch-default -Wno-system-headers -Wundef -Wwrite-strings -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wdeclaration-after-statement -lSDL vmevent-test.c -o vmevent-test /usr/bin/ld: cannot find -lSDL collect2: ld returned 1 exit status make: *** [vmevent-test] Error 1
This patch fixes the issue.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- tools/testing/vmevent/Makefile | 1 - 1 files changed, 0 insertions(+), 1 deletions(-)
diff --git a/tools/testing/vmevent/Makefile b/tools/testing/vmevent/Makefile index 5b5505f..d14b5c9 100644 --- a/tools/testing/vmevent/Makefile +++ b/tools/testing/vmevent/Makefile @@ -20,7 +20,6 @@ WARNINGS += -Wstrict-prototypes WARNINGS += -Wdeclaration-after-statement
CFLAGS = -O3 -g -std=gnu99 $(WARNINGS) -LDFLAGS = -lSDL
PROGRAMS = vmevent-test
On Mon, 9 Apr 2012, Anton Vorontsov wrote:
panacea:~/src/linux/linux-vmevent/tools/testing/vmevent$ make cc -O3 -g -std=gnu99 -Wcast-align -Wformat -Wformat-security -Wformat-y2k -Wshadow -Winit-self -Wpacked -Wredundant-decls -Wstrict-aliasing=3 -Wswitch-default -Wno-system-headers -Wundef -Wwrite-strings -Wbad-function-cast -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wold-style-definition -Wstrict-prototypes -Wdeclaration-after-statement -lSDL vmevent-test.c -o vmevent-test /usr/bin/ld: cannot find -lSDL collect2: ld returned 1 exit status make: *** [vmevent-test] Error 1
This patch fixes the issue.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org
Applied, thanks!
This patch implements a new event type: it will trigger whenever a value crosses a user-specified threshold. It works both ways, i.e. when a value crosses the threshold from the lesser-values side to the greater-values side, and vice versa.
We use the event type in a userspace low-memory killer: we get a notification when memory becomes low, so we start freeing memory by killing unneeded processes, and we get a notification when memory crosses the threshold from the other side, so we know that we have freed enough memory.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org --- include/linux/vmevent.h | 9 +++++++++ mm/vmevent.c | 21 +++++++++++++++++++++ tools/testing/vmevent/vmevent-test.c | 15 ++++++++++----- 3 files changed, 40 insertions(+), 5 deletions(-)
diff --git a/include/linux/vmevent.h b/include/linux/vmevent.h index 64357e4..00cc04f 100644 --- a/include/linux/vmevent.h +++ b/include/linux/vmevent.h @@ -22,6 +22,15 @@ enum { * Sample value is less than user-specified value */ VMEVENT_ATTR_STATE_VALUE_LT = (1UL << 0), + /* + * Sample value crossed user-specified value + */ + VMEVENT_ATTR_STATE_VALUE_CROSS = (1UL << 2), + + /* Last saved state, used internally by the kernel. */ + __VMEVENT_ATTR_STATE_LAST = (1UL << 30), + /* Not first sample, used internally by the kernel. */ + __VMEVENT_ATTR_STATE_NFIRST = (1UL << 31), };
struct vmevent_attr { diff --git a/mm/vmevent.c b/mm/vmevent.c index a56174f..f8fd2d6 100644 --- a/mm/vmevent.c +++ b/mm/vmevent.c @@ -1,5 +1,6 @@ #include <linux/anon_inodes.h> #include <linux/atomic.h> +#include <linux/compiler.h> #include <linux/vmevent.h> #include <linux/syscalls.h> #include <linux/timer.h> @@ -94,6 +95,26 @@ static bool vmevent_match(struct vmevent_watch *watch) if (attr->state & VMEVENT_ATTR_STATE_VALUE_LT) { if (value < attr->value) return true; + } else if (attr->state & VMEVENT_ATTR_STATE_VALUE_CROSS) { + bool fst = !(attr->state & __VMEVENT_ATTR_STATE_NFIRST); + bool old = attr->state & __VMEVENT_ATTR_STATE_LAST; + bool new = value < attr->value; + bool chg = old ^ new; + bool ret = chg; + + /* + * This is not 'lt' or 'gt' match, so on the first + * sample assume we crossed the threshold. + */ + if (unlikely(fst)) { + attr->state |= __VMEVENT_ATTR_STATE_NFIRST; + ret = true; + } + + attr->state &= ~__VMEVENT_ATTR_STATE_LAST; + attr->state |= new ? __VMEVENT_ATTR_STATE_LAST : 0; + + return ret; } }
diff --git a/tools/testing/vmevent/vmevent-test.c b/tools/testing/vmevent/vmevent-test.c index 534f827..39e93af 100644 --- a/tools/testing/vmevent/vmevent-test.c +++ b/tools/testing/vmevent/vmevent-test.c @@ -33,20 +33,25 @@ int main(int argc, char *argv[])
config = (struct vmevent_config) { .sample_period_ns = 1000000000L, - .counter = 4, + .counter = 5, .attrs = { - [0] = { + { + .type = VMEVENT_ATTR_NR_FREE_PAGES, + .state = VMEVENT_ATTR_STATE_VALUE_CROSS, + .value = phys_pages / 2, + }, + { .type = VMEVENT_ATTR_NR_FREE_PAGES, .state = VMEVENT_ATTR_STATE_VALUE_LT, .value = phys_pages, }, - [1] = { + { .type = VMEVENT_ATTR_NR_AVAIL_PAGES, }, - [2] = { + { .type = VMEVENT_ATTR_NR_SWAP_PAGES, }, - [3] = { + { .type = 0xffff, /* invalid */ }, },
On Mon, Apr 9, 2012 at 2:38 AM, Anton Vorontsov anton.vorontsov@linaro.org wrote:
This patch implements a new event type, it will trigger whenever a value crosses a user-specified threshold. It works two-way, i.e. when a value crosses the threshold from a lesser values side to a greater values side, and vice versa.
We use the event type in an userspace low-memory killer: we get a notification when memory becomes low, so we start freeing memory by killing unneeded processes, and we get notification when memory hits the threshold from another side, so we know that we freed enough of memory.
Signed-off-by: Anton Vorontsov anton.vorontsov@linaro.org
include/linux/vmevent.h | 9 +++++++++ mm/vmevent.c | 21 +++++++++++++++++++++ tools/testing/vmevent/vmevent-test.c | 15 ++++++++++----- 3 files changed, 40 insertions(+), 5 deletions(-)
diff --git a/include/linux/vmevent.h b/include/linux/vmevent.h index 64357e4..00cc04f 100644 --- a/include/linux/vmevent.h +++ b/include/linux/vmevent.h @@ -22,6 +22,15 @@ enum { * Sample value is less than user-specified value */ VMEVENT_ATTR_STATE_VALUE_LT = (1UL << 0),
- /*
- * Sample value crossed user-specified value
- */
- VMEVENT_ATTR_STATE_VALUE_CROSS = (1UL << 2),
- /* Last saved state, used internally by the kernel. */
- __VMEVENT_ATTR_STATE_LAST = (1UL << 30),
- /* Not first sample, used internally by the kernel. */
- __VMEVENT_ATTR_STATE_NFIRST = (1UL << 31),
};
struct vmevent_attr { diff --git a/mm/vmevent.c b/mm/vmevent.c index a56174f..f8fd2d6 100644 --- a/mm/vmevent.c +++ b/mm/vmevent.c @@ -1,5 +1,6 @@ #include <linux/anon_inodes.h> #include <linux/atomic.h> +#include <linux/compiler.h> #include <linux/vmevent.h> #include <linux/syscalls.h> #include <linux/timer.h> @@ -94,6 +95,26 @@ static bool vmevent_match(struct vmevent_watch *watch) if (attr->state & VMEVENT_ATTR_STATE_VALUE_LT) { if (value < attr->value) return true;
- } else if (attr->state & VMEVENT_ATTR_STATE_VALUE_CROSS) {
- bool fst = !(attr->state & __VMEVENT_ATTR_STATE_NFIRST);
- bool old = attr->state & __VMEVENT_ATTR_STATE_LAST;
- bool new = value < attr->value;
- bool chg = old ^ new;
- bool ret = chg;
- /*
- * This is not 'lt' or 'gt' match, so on the first
- * sample assume we crossed the threshold.
- */
- if (unlikely(fst)) {
- attr->state |= __VMEVENT_ATTR_STATE_NFIRST;
- ret = true;
- }
- attr->state &= ~__VMEVENT_ATTR_STATE_LAST;
- attr->state |= new ? __VMEVENT_ATTR_STATE_LAST : 0;
- return ret;
} }
Can't we implement this by specifying both VMEVENT_ATTR_STATE_VALUE_LT and VMEVENT_ATTR_STATE_VALUE_GT in userspace? I assume the problem with the current approach is that you get more than one notification, right? We can implement a "single-shot" flag to deal with that.
linaro-kernel@lists.linaro.org