Instead of computing the new cred before we pass the point of no return compute the new cred just before we use it.
This allows the removal of fs_struct->in_exec and cred_guard_mutex.
I am not certain why we wanted to compute the cred for the new executable so early. Perhaps I missed something but I did not see any common errors being signaled. So I don't think we loose anything by computing the new cred later.
We gain a lot.
We stop holding the cred_guard_mutex over places where the code sleeps and waits for userspace. These places include the waiting for the tracer in PTRACE_EVENT_EXIT, "put_user(0, tsk->clear_child_tid)" in mm_release, and "get_user(futex_offset, ...") in exit_robust_mutex.
We can remove fs_struct->in_exec. The case where it was used simply never comes up, when we compute the cred after de_thread completes.
We remove the possibility of a hang between a tracer calling PTRACE_ATTACH/PTRACE_SIEZE and the kernel waiting for the tracer in PTRACE_EVENT_EXIT.
--- Oleg, Kees, Bernd, Can you see anything I am missing?
The code compiles but I haven't test it yet.
I thought I was going to move commit_creds before de_thread, but that would have taken commit_cred out of exec_update_lock (which introduces races).
However I can't see any drawbacks of going the other direction.
fs/exec.c | 88 ++++++++++++++---------------------- fs/fs_struct.c | 1 - fs/proc/base.c | 4 +- include/linux/fs_struct.h | 1 - include/linux/sched/signal.h | 6 --- init/init_task.c | 1 - kernel/cred.c | 2 +- kernel/fork.c | 8 +--- kernel/ptrace.c | 4 +- kernel/seccomp.c | 12 ++--- 10 files changed, 45 insertions(+), 82 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c index 4298e7e08d5d..5ae96584dab0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1090,6 +1090,9 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) perf_event_comm(tsk, exec); }
+static int prepare_bprm_creds(struct linux_binprm *bprm); +static void check_unsafe_exec(struct linux_binprm *bprm); + /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal @@ -1101,10 +1104,6 @@ int begin_new_exec(struct linux_binprm * bprm) struct task_struct *me = current; int retval;
- /* Once we are committed compute the creds */ - retval = bprm_creds_from_file(bprm); - if (retval) - return retval;
/* * This tracepoint marks the point before flushing the old exec where @@ -1123,8 +1122,6 @@ int begin_new_exec(struct linux_binprm * bprm) retval = de_thread(me); if (retval) goto out; - /* see the comment in check_unsafe_exec() */ - current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ @@ -1251,6 +1248,25 @@ int begin_new_exec(struct linux_binprm * bprm) WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0);
+ retval = prepare_bprm_creds(bprm); + if (retval) + goto out_unlock; + + /* + * Check for unsafe execution states before exec_binprm(), which + * will call back into begin_new_exec(), into bprm_creds_from_file(), + * where setuid-ness is evaluated. + */ + check_unsafe_exec(bprm); + + /* Set the unchanging part of bprm->cred */ + retval = security_bprm_creds_for_exec(bprm); + + /* Once we are committed compute the creds */ + retval = bprm_creds_from_file(bprm); + if (retval) + goto out_unlock; + retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; @@ -1272,9 +1288,9 @@ int begin_new_exec(struct linux_binprm * bprm) if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* - * cred_guard_mutex must be held at least to this point to prevent + * exec_update_lock must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's - * credentials; any time after this it may be unlocked. + * credentials. */ security_bprm_committed_creds(bprm);
@@ -1291,8 +1307,6 @@ int begin_new_exec(struct linux_binprm * bprm)
out_unlock: up_write(&me->signal->exec_update_lock); - if (!bprm->cred) - mutex_unlock(&me->signal->cred_guard_mutex);
out: return retval; @@ -1336,7 +1350,6 @@ void setup_new_exec(struct linux_binprm * bprm) */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); - mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec);
@@ -1351,21 +1364,15 @@ void finalize_exec(struct linux_binprm *bprm) EXPORT_SYMBOL(finalize_exec);
/* - * Prepare credentials and lock ->cred_guard_mutex. - * setup_new_exec() commits the new creds and drops the lock. - * Or, if exec fails before, free_bprm() should release ->cred - * and unlock. + * Prepare credentials. begin_new_exec() commits the new creds. + * Or, if exec fails before, free_bprm() should release ->cred. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { - if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) - return -ERESTARTNOINTR; - bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0;
- mutex_unlock(¤t->signal->cred_guard_mutex); return -ENOMEM; }
@@ -1386,9 +1393,7 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { - /* in case exec fails before de_thread() succeeds */ - current->fs->in_exec = 0; - mutex_unlock(¤t->signal->cred_guard_mutex); + /* in case exec fails before commit_creds succeeds */ abort_creds(bprm->cred); } do_close_execat(bprm->file); @@ -1486,13 +1491,12 @@ EXPORT_SYMBOL(bprm_change_interp);
/* * determine how safe it is to execute the proposed program - * - the caller must hold ->cred_guard_mutex to protect against + * - the caller must hold ->exec_update_lock to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { - struct task_struct *p = current, *t; - unsigned n_fs; + struct task_struct *p = current;
if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; @@ -1509,25 +1513,9 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... - * - * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) - * from another sub-thread until de_thread() succeeds, this - * state is protected by cred_guard_mutex we hold. */ - n_fs = 1; - read_seqlock_excl(&p->fs->seq); - rcu_read_lock(); - for_other_threads(p, t) { - if (t->fs == p->fs) - n_fs++; - } - rcu_read_unlock(); - - /* "users" and "in_exec" locked for copy_fs() */ - if (p->fs->users > n_fs) + if (p->fs->users > 1) bprm->unsafe |= LSM_UNSAFE_SHARE; - else - p->fs->in_exec = 1; read_sequnlock_excl(&p->fs->seq); }
@@ -1731,25 +1719,15 @@ static int bprm_execve(struct linux_binprm *bprm) { int retval;
- retval = prepare_bprm_creds(bprm); - if (retval) - return retval; + if (bprm->is_check) + return 0;
- /* - * Check for unsafe execution states before exec_binprm(), which - * will call back into begin_new_exec(), into bprm_creds_from_file(), - * where setuid-ness is evaluated. - */ - check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current);
sched_exec();
- /* Set the unchanging part of bprm->cred */ - retval = security_bprm_creds_for_exec(bprm); - if (retval || bprm->is_check) - goto out; +
retval = exec_binprm(bprm); if (retval < 0) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 28be762ac1c6..945bc0916f65 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -109,7 +109,6 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; - fs->in_exec = 0; seqlock_init(&fs->seq); fs->umask = old->umask;
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97..7041fb4d1689 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2834,14 +2834,14 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, }
/* Guard against adverse ptrace interaction */ - rv = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); + rv = down_write_killable(¤t->signal->exec_update_lock); if (rv < 0) goto out_free;
rv = security_setprocattr(PROC_I(inode)->op.lsmid, file->f_path.dentry->d_name.name, page, count); - mutex_unlock(¤t->signal->cred_guard_mutex); + up_write(¤t->signal->exec_update_lock); out_free: kfree(page); out: diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index baf200ab5c77..29d0f7d57743 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -10,7 +10,6 @@ struct fs_struct { int users; seqlock_t seq; int umask; - int in_exec; struct path root, pwd; } __randomize_layout;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 7d6449982822..7e9259c8fb2b 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -241,12 +241,6 @@ struct signal_struct { struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */
- struct mutex cred_guard_mutex; /* guard against foreign influences on - * credential calculations - * (notably. ptrace) - * Deprecated do not use in new code. - * Use exec_update_lock instead. - */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent diff --git a/init/init_task.c b/init/init_task.c index a55e2189206f..4813bffe217e 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -30,7 +30,6 @@ static struct signal_struct init_signals = { #ifdef CONFIG_CGROUPS .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), #endif - .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = HLIST_HEAD_INIT, diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..80e376ce005f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -252,7 +252,7 @@ EXPORT_SYMBOL(prepare_creds);
/* * Prepare credentials for current to perform an execve() - * - The caller must hold ->cred_guard_mutex + * - The caller must hold ->exec_update_lock */ struct cred *prepare_exec_creds(void) { diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..996c649b9a4c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1555,11 +1555,6 @@ static int copy_fs(u64 clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ read_seqlock_excl(&fs->seq); - /* "users" and "in_exec" locked for check_unsafe_exec() */ - if (fs->in_exec) { - read_sequnlock_excl(&fs->seq); - return -EAGAIN; - } fs->users++; read_sequnlock_excl(&fs->seq); return 0; @@ -1699,7 +1694,6 @@ static int copy_signal(u64 clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min;
- mutex_init(&sig->cred_guard_mutex); init_rwsem(&sig->exec_update_lock);
return 0; @@ -1710,7 +1704,7 @@ static void copy_seccomp(struct task_struct *p) #ifdef CONFIG_SECCOMP /* * Must be called with sighand->lock held, which is common to - * all threads in the group. Holding cred_guard_mutex is not + * all threads in the group. Holding exec_update_lock is not * needed because this new task is not yet running and cannot * be racing exec. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 75a84efad40f..8140d4bfc279 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -444,8 +444,8 @@ static int ptrace_attach(struct task_struct *task, long request, * SUID, SGID and LSM creds get determined differently * under ptrace. */ - scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, - &task->signal->cred_guard_mutex) { + scoped_cond_guard (rwsem_read_intr, return -ERESTARTNOINTR, + &task->signal->exec_update_lock) {
scoped_guard (task_lock, task) { retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 25f62867a16d..87de8d47d876 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -479,7 +479,7 @@ static int is_ancestor(struct seccomp_filter *parent, /** * seccomp_can_sync_threads: checks if all threads can be synchronized * - * Expects sighand and cred_guard_mutex locks to be held. + * Expects sighand and exec_update_lock locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was * either not in the correct seccomp mode or did not have an ancestral @@ -489,7 +489,7 @@ static inline pid_t seccomp_can_sync_threads(void) { struct task_struct *thread, *caller;
- BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + BUG_ON(!rwsem_is_locked(¤t->signal->exec_update_lock)); assert_spin_locked(¤t->sighand->siglock);
/* Validate all threads being eligible for synchronization. */ @@ -590,7 +590,7 @@ void seccomp_filter_release(struct task_struct *tsk) * * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. * - * Expects sighand and cred_guard_mutex locks to be held, and for + * Expects sighand and exec_update_lock locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. * @@ -599,7 +599,7 @@ static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller;
- BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); + BUG_ON(!rwsem_is_locked(¤t->signal->exec_update_lock)); assert_spin_locked(¤t->sighand->siglock);
/* @@ -2011,7 +2011,7 @@ static long seccomp_set_mode_filter(unsigned int flags, * while another thread is in the middle of calling exec. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC && - mutex_lock_killable(¤t->signal->cred_guard_mutex)) + down_read_killable(¤t->signal->exec_update_lock)) goto out_put_fd;
spin_lock_irq(¤t->sighand->siglock); @@ -2034,7 +2034,7 @@ static long seccomp_set_mode_filter(unsigned int flags, out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) - mutex_unlock(¤t->signal->cred_guard_mutex); + up_read(¤t->signal->exec_update_lock); out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret) {