On 6/10/21 9:31 AM, Bernd Edlinger wrote:
This introduces signal->unsafe_execve_in_progress, which is used to fix the case where at least one of the sibling threads is traced: the tracer process may then dead-lock in ptrace_attach, while de_thread needs to wait for the tracer to continue execution.
The solution is to detect this situation and allow ptrace_attach to continue while de_thread() is still waiting for the traced zombies to be eventually released. When the ptrace status of the current thread has changed from non-traced to traced, we can simply abort the whole execve and restart it by returning -ERESTARTSYS. This needs to be done before changing the thread leader, because PTRACE_EVENT_EXEC needs to know the old thread pid.
Although it is technically after the point of no return, we just have to reset bprm->point_of_no_return here, since at this time only the other threads have received a fatal signal, not the current thread.
From the user's point of view the whole execve was simply delayed until after the ptrace_attach.
Other threads die quickly since the cred_guard_mutex is released and a deadly signal is already pending. In case the mutex_lock_killable misses the signal, ->unsafe_execve_in_progress makes sure they release the mutex immediately and return with -ERESTARTNOINTR.
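For reference, the scenario boils down to something like the sketch below (modeled on the vmaccess selftest; the file name, helper names and timings are illustrative only, not part of the patch): a sibling thread makes the fork parent its tracer and exits as a traced zombie, the leader then calls execve so de_thread() has to wait for that zombie to be reaped, while the parent blocks in PTRACE_ATTACH on the cred_guard_mutex.

/* deadlock-sketch.c - illustrative only; build with: gcc -pthread deadlock-sketch.c */
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void *traceme_thread(void *arg)
{
	/* make the fork parent our tracer, then exit as a traced zombie */
	ptrace(PTRACE_TRACEME, 0, 0L, 0L);
	return NULL;
}

int main(void)
{
	pid_t pid = fork();

	if (!pid) {
		pthread_t pt;

		pthread_create(&pt, NULL, traceme_thread, NULL);
		pthread_join(pt, NULL);
		/* de_thread() must wait until the traced zombie is reaped */
		execlp("true", "true", (char *)NULL);
		_exit(1);
	}

	sleep(1);	/* give the child time to reach de_thread() */
	/*
	 * Before the fix the tracer can block here on the cred_guard_mutex
	 * while de_thread() waits for us to reap the zombie thread; with
	 * the fix the attach is expected to succeed at the first attempt.
	 */
	if (ptrace(PTRACE_ATTACH, pid, 0L, 0L) == -1)
		perror("PTRACE_ATTACH");
	kill(pid, SIGKILL);			/* tear the child down */
	while (waitpid(-1, NULL, 0) > 0)
		;				/* reap zombie thread and child */
	return 0;
}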
This means there is no API change, unlike the previous version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.de...
See tools/testing/selftests/ptrace/vmaccess.c for a test case that gets fixed by this change.
Note that since the test case was originally designed to check that ptrace_attach returns an error in this situation, the test expectation needed to be adjusted to allow the attach to succeed at the first attempt.
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
 fs/exec.c                                 | 45 ++++++++++++++++++++++++++-----
 fs/proc/base.c                            |  6 +++++
 include/linux/sched/signal.h              | 13 +++++++++
 kernel/ptrace.c                           |  9 +++++++
 kernel/seccomp.c                          | 12 ++++++---
 tools/testing/selftests/ptrace/vmaccess.c | 25 +++++++++++------
 6 files changed, 92 insertions(+), 18 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 8344fba..ac3fec1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1040,6 +1040,8 @@ static int de_thread(struct task_struct *tsk)
 	struct signal_struct *sig = tsk->signal;
 	struct sighand_struct *oldsighand = tsk->sighand;
 	spinlock_t *lock = &oldsighand->siglock;
+	unsigned int prev_ptrace = tsk->ptrace;
+	struct task_struct *t = tsk;
 
 	if (thread_group_empty(tsk))
 		goto no_thread_group;
@@ -1057,20 +1059,40 @@ static int de_thread(struct task_struct *tsk)
 		return -EAGAIN;
 	}
 
+	while_each_thread(tsk, t) {
+		if (unlikely(t->ptrace) && t != tsk->group_leader)
+			sig->unsafe_execve_in_progress = true;
+	}
+
 	sig->group_exit_task = tsk;
 	sig->notify_count = zap_other_threads(tsk);
 	if (!thread_group_leader(tsk))
 		sig->notify_count--;
+	spin_unlock_irq(lock);
 
-	while (sig->notify_count) {
-		__set_current_state(TASK_KILLABLE);
-		spin_unlock_irq(lock);
+	if (unlikely(sig->unsafe_execve_in_progress))
+		mutex_unlock(&sig->cred_guard_mutex);
+
+	for (;;) {
+		set_current_state(TASK_KILLABLE);
+		if (!sig->notify_count)
+			break;
 		schedule();
 		if (__fatal_signal_pending(tsk))
 			goto killed;
-		spin_lock_irq(lock);
 	}
-	spin_unlock_irq(lock);
+	__set_current_state(TASK_RUNNING);
Oh, sorry, I think I'll need to keep this spin_lock here, because otherwise the assignment sig->group_exit_task = NULL below will race with
kernel/exit.c (__exit_signal):

	if (sig->notify_count > 0 && !--sig->notify_count)
		wake_up_process(sig->group_exit_task);
which runs under spin_lock(&sighand->siglock) and tasklist_lock write-locked.
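One possible shape for that follow-up (just a sketch of the idea, not necessarily what the updated patch will do) is to retake the siglock around the reset:

	if (!prev_ptrace && tsk->ptrace) {
		/*
		 * Retake the siglock so that clearing group_exit_task and
		 * notify_count cannot race with __exit_signal(), which
		 * reads them under sighand->siglock.
		 */
		spin_lock_irq(lock);
		sig->group_exit_task = NULL;
		sig->notify_count = 0;
		spin_unlock_irq(lock);
		return -ERESTARTSYS;
	}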
Will send an update on the patch later today.
Bernd.
+	if (unlikely(sig->unsafe_execve_in_progress)) {
+		if (mutex_lock_killable(&sig->cred_guard_mutex))
+			goto killed;
+		sig->unsafe_execve_in_progress = false;
+		if (!prev_ptrace && tsk->ptrace) {
+			sig->group_exit_task = NULL;
+			sig->notify_count = 0;
+			return -ERESTARTSYS;
+		}
+	}
 
 	/*
 	 * At this point all other threads have exited, all we have to
@@ -1255,8 +1277,11 @@ int begin_new_exec(struct linux_binprm * bprm)
 	 * Make this the only thread in the thread group.
 	 */
 	retval = de_thread(me);
-	if (retval)
+	if (retval) {
+		if (retval == -ERESTARTSYS)
+			bprm->point_of_no_return = false;
 		goto out;
+	}
 
 	/*
 	 * Cancel any io_uring activity across execve
@@ -1466,6 +1491,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
 	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
 		return -ERESTARTNOINTR;
 
+	if (unlikely(current->signal->unsafe_execve_in_progress)) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		return -ERESTARTNOINTR;
+	}
+
 	bprm->cred = prepare_exec_creds();
 	if (likely(bprm->cred))
 		return 0;
@@ -1482,7 +1512,8 @@ static void free_bprm(struct linux_binprm *bprm)
 	}
 	free_arg_pages(bprm);
 	if (bprm->cred) {
-		mutex_unlock(&current->signal->cred_guard_mutex);
+		if (!current->signal->unsafe_execve_in_progress)
+			mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
 	if (bprm->file) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3851bfc..3b2a55c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2739,6 +2739,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	if (rv < 0)
 		goto out_free;
 
+	if (unlikely(current->signal->unsafe_execve_in_progress)) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		rv = -ERESTARTNOINTR;
+		goto out_free;
+	}
+
 	rv = security_setprocattr(PROC_I(inode)->op.lsm,
				  file->f_path.dentry->d_name.name, page,
				  count);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3f6a0fc..220a083 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -214,6 +214,17 @@ struct signal_struct {
 #endif
 
+	/*
+	 * Set while execve is executing but is *not* holding
+	 * cred_guard_mutex to avoid possible dead-locks.
+	 * The cred_guard_mutex is released *after* de_thread() has
+	 * called zap_other_threads(), therefore a fatal signal is
+	 * guaranteed to be already pending in the unlikely event that
+	 * current->signal->unsafe_execve_in_progress happens to be
+	 * true after the cred_guard_mutex was acquired.
+	 */
+	bool unsafe_execve_in_progress;
+
 	/*
 	 * Thread is the potential origin of an oom condition; kill first on
 	 * oom
 	 */
@@ -227,6 +238,8 @@ struct signal_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace)
+					 * Held while execve runs, except when
+					 * a sibling thread is being traced.
 					 * Deprecated do not use in new code.
 					 * Use exec_update_lock instead.
 					 */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61db50f..0cbc1eb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -468,6 +468,14 @@ static int ptrace_traceme(void)
 {
 	int ret = -EPERM;
 
+	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
+		return -ERESTARTNOINTR;
+
+	if (unlikely(current->signal->unsafe_execve_in_progress)) {
+		mutex_unlock(&current->signal->cred_guard_mutex);
+		return -ERESTARTNOINTR;
+	}
+
 	write_lock_irq(&tasklist_lock);
 	/* Are we already being traced? */
 	if (!current->ptrace) {
@@ -483,6 +491,7 @@ static int ptrace_traceme(void)
 		}
 	}
 	write_unlock_irq(&tasklist_lock);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 
 	return ret;
 }
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1d60fc2..b1389ee 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1824,9 +1824,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	 * Make sure we cannot change seccomp or nnp state via TSYNC
 	 * while another thread is in the middle of calling exec.
 	 */
-	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
-	    mutex_lock_killable(&current->signal->cred_guard_mutex))
-		goto out_put_fd;
+	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+		if (mutex_lock_killable(&current->signal->cred_guard_mutex))
+			goto out_put_fd;
+
+		if (unlikely(current->signal->unsafe_execve_in_progress)) {
+			mutex_unlock(&current->signal->cred_guard_mutex);
+			goto out_put_fd;
+		}
+	}
 
 	spin_lock_irq(&current->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
index 4db327b..c7c2242 100644
--- a/tools/testing/selftests/ptrace/vmaccess.c
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -39,8 +39,15 @@ static void *thread(void *arg)
 	f = open(mm, O_RDONLY);
 	ASSERT_GE(f, 0);
 	close(f);
-	f = kill(pid, SIGCONT);
-	ASSERT_EQ(f, 0);
+	f = waitpid(-1, NULL, 0);
+	ASSERT_NE(f, -1);
+	ASSERT_NE(f, 0);
+	ASSERT_NE(f, pid);
+	f = waitpid(-1, NULL, 0);
+	ASSERT_EQ(f, pid);
+	f = waitpid(-1, NULL, 0);
+	ASSERT_EQ(f, -1);
+	ASSERT_EQ(errno, ECHILD);
 }
 
 TEST(attach)
@@ -57,22 +64,24 @@ static void *thread(void *arg)
 	sleep(1);
 	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
-	ASSERT_EQ(errno, EAGAIN);
-	ASSERT_EQ(k, -1);
+	ASSERT_EQ(k, 0);
 	k = waitpid(-1, &s, WNOHANG);
 	ASSERT_NE(k, -1);
 	ASSERT_NE(k, 0);
 	ASSERT_NE(k, pid);
 	ASSERT_EQ(WIFEXITED(s), 1);
 	ASSERT_EQ(WEXITSTATUS(s), 0);
-	sleep(1);
-	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
-	ASSERT_EQ(k, 0);
 	k = waitpid(-1, &s, 0);
 	ASSERT_EQ(k, pid);
 	ASSERT_EQ(WIFSTOPPED(s), 1);
 	ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
-	k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
 	ASSERT_EQ(k, 0);
+	k = waitpid(-1, &s, 0);
+	ASSERT_EQ(k, pid);
+	ASSERT_EQ(WIFSTOPPED(s), 1);
+	ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
+	k = ptrace(PTRACE_CONT, pid, 0L, 0L);
 	ASSERT_EQ(k, 0);
 	k = waitpid(-1, &s, 0);
 	ASSERT_EQ(k, pid);