With a write operation on psi files replacing the old trigger with a new one, the lifetime of the trigger's waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed, and a pending poll() will stumble on trigger->event_wait, which was destroyed. Fix this by disallowing redefinition of an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with an EBUSY error. Also remove the psi_disabled check in psi_trigger_destroy(), as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Cc: stable@vger.kernel.org Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Analyzed-by: Eric Biggers ebiggers@kernel.org Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Suren Baghdasaryan surenb@google.com --- Changes in v3: - Use smp_load_acquire/smp_store_release to read/write trigger pointer, per Eric and Linus
Documentation/accounting/psi.rst | 3 +- include/linux/psi.h | 2 +- include/linux/psi_types.h | 3 -- kernel/cgroup/cgroup.c | 11 ++++-- kernel/sched/psi.c | 66 ++++++++++++++------------------ 5 files changed, 40 insertions(+), 45 deletions(-)
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index f2b3439edcc2..860fe651d645 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger for the same psi metric can be specified. However for each trigger a separate file descriptor is required to be able to poll it separately from others, therefore for each trigger a separate open() syscall should be made even -when opening the same psi interface file. +when opening the same psi interface file. Write operations to a file descriptor +with an already existing psi trigger will fail with EBUSY.
Monitors activate only when system enters stall state for the monitored psi metric and deactivates upon exit from the stall state. While system is diff --git a/include/linux/psi.h b/include/linux/psi.h index 65eb1476ac70..74f7148dfb9f 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -32,7 +32,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0a23300d49af..6537d0c92825 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -129,9 +129,6 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; - - /* Refcounting to prevent premature destruction */ - struct kref refcount; };
struct psi_group { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cafb8c114a21..d18c2ef3180e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3642,6 +3642,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn);
+ /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3649,8 +3655,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); }
- psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp);
return nbytes; @@ -3689,7 +3694,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv;
- psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); }
bool cgroup_psi_enabled(void) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1652f2bb54b7..232b4c05eebc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1151,7 +1151,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount);
mutex_lock(&group->trigger_lock);
@@ -1180,15 +1179,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; }
-static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return;
+ group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1224,9 +1227,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock);
/* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1243,18 +1246,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); }
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1264,24 +1255,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy); - return ret; }
@@ -1305,14 +1287,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock);
return nbytes; @@ -1347,7 +1339,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); }
On Tue, Jan 11, 2022 at 03:23:09PM -0800, Suren Baghdasaryan wrote:
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Cc: stable@vger.kernel.org Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Analyzed-by: Eric Biggers ebiggers@kernel.org Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Suren Baghdasaryan surenb@google.com
Looks good,
Reviewed-by: Eric Biggers ebiggers@google.com
- Eric
On Tue, Jan 11, 2022 at 03:23:09PM -0800, Suren Baghdasaryan wrote:
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Cc: stable@vger.kernel.org Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Analyzed-by: Eric Biggers ebiggers@kernel.org Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Suren Baghdasaryan surenb@google.com
Thanks, I'll go stick this in sched/urgent unless Linus picks it up himself.
On Wed, Jan 12, 2022 at 2:04 AM Peter Zijlstra peterz@infradead.org wrote:
Thanks, I'll go stick this in sched/urgent unless Linus picks it up himself.
I'll let it go through the proper channels, it's not like a few days or whatever will make a difference.
Linus
On Tue, Jan 11, 2022 at 03:23:09PM -0800, Suren Baghdasaryan wrote:
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Cc: stable@vger.kernel.org Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Analyzed-by: Eric Biggers ebiggers@kernel.org Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Suren Baghdasaryan surenb@google.com
Acked-by: Johannes Weiner hannes@cmpxchg.org
)
On Wed, Jan 12, 2022 at 6:40 AM Johannes Weiner hannes@cmpxchg.org wrote:
On Tue, Jan 11, 2022 at 03:23:09PM -0800, Suren Baghdasaryan wrote:
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Cc: stable@vger.kernel.org Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Analyzed-by: Eric Biggers ebiggers@kernel.org Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Suren Baghdasaryan surenb@google.com
Acked-by: Johannes Weiner hannes@cmpxchg.org
Hmm. The kernel test robot notified me of new warnings (which are not really new), but I don't think this patch specifically introduced them:
kernel/sched/psi.c:1112:21: warning: no previous prototype for function 'psi_trigger_create' [-Wmissing-prototypes] struct psi_trigger *psi_trigger_create(struct psi_group *group, ^ kernel/sched/psi.c:1112:1: note: declare 'static' if the function is not intended to be used outside of this translation unit struct psi_trigger *psi_trigger_create(struct psi_group *group, ^ static
kernel/sched/psi.c:1182:6: warning: no previous prototype for function 'psi_trigger_destroy' [-Wmissing-prototypes]
void psi_trigger_destroy(struct psi_trigger *t) ^ kernel/sched/psi.c:1182:1: note: declare 'static' if the function is not intended to be used outside of this translation unit void psi_trigger_destroy(struct psi_trigger *t) ^ static kernel/sched/psi.c:1249:10: warning: no previous prototype for function 'psi_trigger_poll' [-Wmissing-prototypes] __poll_t psi_trigger_poll(void **trigger_ptr, ^ kernel/sched/psi.c:1249:1: note: declare 'static' if the function is not intended to be used outside of this translation unit __poll_t psi_trigger_poll(void **trigger_ptr, ^
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not declared in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user, cgroup.c, is disabled. The cleanest way to fix these, I think, is by doing something like this in psi.c:
struct psi_trigger *_psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { // original psi_trigger_create code }
#ifdef CONFIG_CGROUPS
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { return _psi_trigger_create(group, buf, nbytes, res); }
#else
static struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { return _psi_trigger_create(group, buf, nbytes, res); }
#endif
Two questions: 1. Is this even worth fixing? 2. If so, I would like to do that as a separate patch (these warnings are unrelated to the changes in this patch). Would that be ok? Thanks, Suren.
On Wed, Jan 12, 2022 at 9:43 AM Suren Baghdasaryan surenb@google.com wrote:
)
On Wed, Jan 12, 2022 at 6:40 AM Johannes Weiner hannes@cmpxchg.org wrote:
On Tue, Jan 11, 2022 at 03:23:09PM -0800, Suren Baghdasaryan wrote:
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Cc: stable@vger.kernel.org Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Analyzed-by: Eric Biggers ebiggers@kernel.org Suggested-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Suren Baghdasaryan surenb@google.com
Acked-by: Johannes Weiner hannes@cmpxchg.org
Hmm. kernel test robot notified me of new (which are not really new) warnings but I don't think this patch specifically introduced them:
kernel/sched/psi.c:1112:21: warning: no previous prototype for function 'psi_trigger_create' [-Wmissing-prototypes] struct psi_trigger *psi_trigger_create(struct psi_group *group, ^ kernel/sched/psi.c:1112:1: note: declare 'static' if the function is not intended to be used outside of this translation unit struct psi_trigger *psi_trigger_create(struct psi_group *group, ^ static
kernel/sched/psi.c:1182:6: warning: no previous prototype for function 'psi_trigger_destroy' [-Wmissing-prototypes]
void psi_trigger_destroy(struct psi_trigger *t) ^ kernel/sched/psi.c:1182:1: note: declare 'static' if the function is not intended to be used outside of this translation unit void psi_trigger_destroy(struct psi_trigger *t) ^ static kernel/sched/psi.c:1249:10: warning: no previous prototype for function 'psi_trigger_poll' [-Wmissing-prototypes] __poll_t psi_trigger_poll(void **trigger_ptr, ^ kernel/sched/psi.c:1249:1: note: declare 'static' if the function is not intended to be used outside of this translation unit __poll_t psi_trigger_poll(void **trigger_ptr, ^
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
struct psi_trigger *_psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { // original psi_trigger_create code }
#ifdef CONFIG_CGROUPS
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { return _psi_trigger_create(group, buf, nbytes, res); }
#else
static struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { return _psi_trigger_create(group, buf, nbytes, res); }
#endif
Actually this would be enough:
static struct psi_trigger *_psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { // original psi_trigger_create code }
#ifdef CONFIG_CGROUPS struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { return _psi_trigger_create(group, buf, nbytes, res); } #endif
and locally we use _psi_trigger_create().
Two questions:
- Is this even worth fixing?
- If so, I would like to do that as a separate patch (these warnings
are unrelated to the changes in this patch). Would that be ok? Thanks, Suren.
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
On Wed, Jan 12, 2022 at 10:16 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
Thanks Matthew! That looks much cleaner. I'll post a separate patch to fix these. My main concern was whether it's worth adding more code to satisfy this warning but with this approach the code changes are minimal, so I'll go ahead and post it shortly.
On Wed, Jan 12, 2022 at 10:26:08AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:16 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
Thanks Matthew! That looks much cleaner. I'll post a separate patch to fix these. My main concern was whether it's worth adding more code to satisfy this warning but with this approach the code changes are minimal, so I'll go ahead and post it shortly.
Why not simply move the declarations of psi_trigger_create() and psi_trigger_destroy() in include/linux/psi.h outside of the '#ifdef CONFIG_CGROUPS' block, to match the .c file?
They *could* be static when !CONFIG_CGROUPS, but IMO it's not worth bothering.
- Eric
On Wed, Jan 12, 2022 at 10:44 AM Eric Biggers ebiggers@kernel.org wrote:
On Wed, Jan 12, 2022 at 10:26:08AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:16 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
Thanks Matthew! That looks much cleaner. I'll post a separate patch to fix these. My main concern was whether it's worth adding more code to satisfy this warning but with this approach the code changes are minimal, so I'll go ahead and post it shortly.
Why not simply move the declarations of psi_trigger_create() and psi_trigger_destroy() in include/linux/psi.h outside of the '#ifdef CONFIG_CGROUPS' block, to match the .c file?
IIRC this was done to avoid another warning that these functions are not used outside of psi.c when CONFIG_CGROUPS=n
They *could* be static when !CONFIG_CGROUPS, but IMO it's not worth bothering.
- Eric
On Wed, Jan 12, 2022 at 10:53:48AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:44 AM Eric Biggers ebiggers@kernel.org wrote:
On Wed, Jan 12, 2022 at 10:26:08AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:16 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
Thanks Matthew! That looks much cleaner. I'll post a separate patch to fix these. My main concern was whether it's worth adding more code to satisfy this warning but with this approach the code changes are minimal, so I'll go ahead and post it shortly.
Why not simply move the declarations of psi_trigger_create() and psi_trigger_destroy() in include/linux/psi.h outside of the '#ifdef CONFIG_CGROUPS' block, to match the .c file?
IIRC this was done to avoid another warning that these functions are not used outside of psi.c when CONFIG_CGROUPS=n
What tool gave that warning?
- Eric
On Wed, Jan 12, 2022 at 11:04 AM Eric Biggers ebiggers@kernel.org wrote:
On Wed, Jan 12, 2022 at 10:53:48AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:44 AM Eric Biggers ebiggers@kernel.org wrote:
On Wed, Jan 12, 2022 at 10:26:08AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:16 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
Thanks Matthew! That looks much cleaner. I'll post a separate patch to fix these. My main concern was whether it's worth adding more code to satisfy this warning but with this approach the code changes are minimal, so I'll go ahead and post it shortly.
Why not simply move the declarations of psi_trigger_create() and psi_trigger_destroy() in include/linux/psi.h outside of the '#ifdef CONFIG_CGROUPS' block, to match the .c file?
IIRC this was done to avoid another warning that these functions are not used outside of psi.c when CONFIG_CGROUPS=n
What tool gave that warning?
Let me double-check by building it. It has been a while since I developed the code and I don't want to mislead by making false claims.
- Eric
On Wed, Jan 12, 2022 at 11:06 AM Suren Baghdasaryan surenb@google.com wrote:
On Wed, Jan 12, 2022 at 11:04 AM Eric Biggers ebiggers@kernel.org wrote:
On Wed, Jan 12, 2022 at 10:53:48AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:44 AM Eric Biggers ebiggers@kernel.org wrote:
On Wed, Jan 12, 2022 at 10:26:08AM -0800, Suren Baghdasaryan wrote:
On Wed, Jan 12, 2022 at 10:16 AM Matthew Wilcox willy@infradead.org wrote:
On Wed, Jan 12, 2022 at 09:49:00AM -0800, Suren Baghdasaryan wrote:
This happens with the following config:
CONFIG_CGROUPS=n CONFIG_PSI=y
With cgroups disabled these functions are defined as non-static but are not defined in the header (https://elixir.bootlin.com/linux/latest/source/include/linux/psi.h#L28) since the only external user cgroup.c is disabled. The cleanest way to fix these I think is by doing smth like this in psi.c:
A cleaner way to solve these is simply:
#ifndef CONFIG_CGROUPS static struct psi_trigger *psi_trigger_create(...); ... #endif
I tested this works:
$ cat foo5.c static int psi(void *);
int psi(void *x) { return (int)(long)x; }
int bar(void *x) { return psi(x); } $ gcc -W -Wall -O2 -c -o foo5.o foo5.c $ readelf -s foo5.o
Symbol table '.symtab' contains 4 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000000 0 FILE LOCAL DEFAULT ABS foo5.c 2: 0000000000000000 0 SECTION LOCAL DEFAULT 1 .text 3: 0000000000000000 3 FUNC GLOBAL DEFAULT 1 bar
Thanks Matthew! That looks much cleaner. I'll post a separate patch to fix these. My main concern was whether it's worth adding more code to satisfy this warning but with this approach the code changes are minimal, so I'll go ahead and post it shortly.
Why not simply move the declarations of psi_trigger_create() and psi_trigger_destroy() in include/linux/psi.h outside of the '#ifdef CONFIG_CGROUPS' block, to match the .c file?
IIRC this was done to avoid another warning that these functions are not used outside of psi.c when CONFIG_CGROUPS=n
What tool gave that warning?
Let me double-check by building it. It has been a while since I developed the code and I don't want to mislead by making false claims.
No warnings, so it was probably done to keep the scope of these functions as local as possible. I agree that moving them out of #ifdef CONFIG_CGROUPS in the header file makes sense here. The scope unnecessarily expands when CONFIG_CGROUPS=n but the code is simpler. Will do that then.
I noticed there is another warning about psi_cpu_proc_ops and similar structures being unused when CONFIG_PROC_FS=n. Looks like I'll need some more ifdefs to fix all these warnings.
- Eric
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: a06247c6804f1a7c86a2e5398a4c1f1db1471848 Gitweb: https://git.kernel.org/tip/a06247c6804f1a7c86a2e5398a4c1f1db1471848 Author: Suren Baghdasaryan surenb@google.com AuthorDate: Tue, 11 Jan 2022 15:23:09 -08:00 Committer: Peter Zijlstra peterz@infradead.org CommitterDate: Tue, 18 Jan 2022 12:09:57 +01:00
psi: Fix uaf issue when psi trigger is destroyed while being polled
With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak.
Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Suggested-by: Linus Torvalds torvalds@linux-foundation.org Analyzed-by: Eric Biggers ebiggers@kernel.org Signed-off-by: Suren Baghdasaryan surenb@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Eric Biggers ebiggers@google.com Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com --- Documentation/accounting/psi.rst | 3 +- include/linux/psi.h | 2 +- include/linux/psi_types.h | 3 +- kernel/cgroup/cgroup.c | 11 +++-- kernel/sched/psi.c | 66 +++++++++++++------------------ 5 files changed, 40 insertions(+), 45 deletions(-)
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index f2b3439..860fe65 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger for the same psi metric can be specified. However for each trigger a separate file descriptor is required to be able to poll it separately from others, therefore for each trigger a separate open() syscall should be made even -when opening the same psi interface file. +when opening the same psi interface file. Write operations to a file descriptor +with an already existing psi trigger will fail with EBUSY.
Monitors activate only when system enters stall state for the monitored psi metric and deactivates upon exit from the stall state. While system is diff --git a/include/linux/psi.h b/include/linux/psi.h index a70ca83..f8ce53b 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);
struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 516c0fe..1a3cef2 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -141,9 +141,6 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; - - /* Refcounting to prevent premature destruction */ - struct kref refcount; };
struct psi_group { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e146..9d05c3c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn);
+ /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); }
- psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp);
return nbytes; @@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv;
- psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); }
bool cgroup_psi_enabled(void) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613..c137c4d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount);
mutex_lock(&group->trigger_lock);
@@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; }
-static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return;
+ group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock);
/* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); }
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy); - return ret; }
@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock);
return nbytes; @@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); }
linux-stable-mirror@lists.linaro.org