[PATCH v17] exec: Fix dead-lock in de_thread with ptrace_attach
Peter Zijlstra
peterz at infradead.org
Mon Oct 27 12:06:57 UTC 2025
On Thu, Aug 21, 2025 at 07:34:58PM +0200, Bernd Edlinger wrote:
> The solution is to detect this situation and allow
> ptrace_attach to continue by temporarily releasing the
> cred_guard_mutex, while de_thread() is still waiting for
> traced zombies to be eventually released by the tracer.
> In the case of the thread group leader we only have to wait
> for the thread to become a zombie, which may also need
> co-operation from the tracer due to PTRACE_O_TRACEEXIT.
>
> When a tracer wants to ptrace_attach a task that already
> is in execve, we simply retry the ptrace_may_access
> check while temporarily installing the new credentials
> and dumpability which are about to be used after execve
> completes. If the ptrace_attach happens on a thread that
> is a sibling-thread of the thread doing execve, it is
> sufficient to check against the old credentials, as this
> thread will be waited for, before the new credentials are
> installed.
>
> Other threads die quickly since the cred_guard_mutex is
> released, but a deadly signal is already pending. In case
> the mutex_lock_killable misses the signal, the non-zero
> current->signal->exec_bprm makes sure they release the
> mutex immediately and return with -ERESTARTNOINTR.
> diff --git a/fs/exec.c b/fs/exec.c
> index 2a1e5e4042a1..31c6ceaa5f69 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -905,11 +905,13 @@ static int exec_mmap(struct mm_struct *mm)
> return 0;
> }
>
> -static int de_thread(struct task_struct *tsk)
> +static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm)
> {
> struct signal_struct *sig = tsk->signal;
> struct sighand_struct *oldsighand = tsk->sighand;
> spinlock_t *lock = &oldsighand->siglock;
> + struct task_struct *t;
> + bool unsafe_execve_in_progress = false;
>
> if (thread_group_empty(tsk))
> goto no_thread_group;
> @@ -932,6 +934,19 @@ static int de_thread(struct task_struct *tsk)
> if (!thread_group_leader(tsk))
> sig->notify_count--;
>
> + for_other_threads(tsk, t) {
> + if (unlikely(t->ptrace)
> + && (t != tsk->group_leader || !t->exit_state))
&& goes at the end of the previous line
> + unsafe_execve_in_progress = true;
> + }
> +
> + if (unlikely(unsafe_execve_in_progress)) {
> + spin_unlock_irq(lock);
> + sig->exec_bprm = bprm;
> + mutex_unlock(&sig->cred_guard_mutex);
> + spin_lock_irq(lock);
I'm not clear why we need to drop and re-acquire siglock here.
And I would like a very large comment here explaining why it is safe to
drop cred_guard_mutex here.
> + }
> +
> while (sig->notify_count) {
> __set_current_state(TASK_KILLABLE);
> spin_unlock_irq(lock);
> @@ -1021,6 +1036,11 @@ static int de_thread(struct task_struct *tsk)
> release_task(leader);
> }
>
> + if (unlikely(unsafe_execve_in_progress)) {
> + mutex_lock(&sig->cred_guard_mutex);
> + sig->exec_bprm = NULL;
> + }
> +
> sig->group_exec_task = NULL;
> sig->notify_count = 0;
>
> @@ -1032,6 +1052,11 @@ static int de_thread(struct task_struct *tsk)
> return 0;
>
> killed:
> + if (unlikely(unsafe_execve_in_progress)) {
> + mutex_lock(&sig->cred_guard_mutex);
> + sig->exec_bprm = NULL;
> + }
> +
> /* protects against exit_notify() and __exit_signal() */
> read_lock(&tasklist_lock);
> sig->group_exec_task = NULL;
> @@ -1114,13 +1139,31 @@ int begin_new_exec(struct linux_binprm * bprm)
> */
> trace_sched_prepare_exec(current, bprm);
>
> + /* If the binary is not readable then enforce mm->dumpable=0 */
> + would_dump(bprm, bprm->file);
> + if (bprm->have_execfd)
> + would_dump(bprm, bprm->executable);
> +
> + /*
> + * Figure out dumpability. Note that this checking only of current
> + * is wrong, but userspace depends on it. This should be testing
> + * bprm->secureexec instead.
> + */
> + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
> + is_dumpability_changed(current_cred(), bprm->cred) ||
> + !(uid_eq(current_euid(), current_uid()) &&
> + gid_eq(current_egid(), current_gid())))
> + set_dumpable(bprm->mm, suid_dumpable);
> + else
> + set_dumpable(bprm->mm, SUID_DUMP_USER);
> +
I feel like moving this dumpable stuff around could be a separate patch.
Which can explain how that is correct and why it is needed and all that.
> /*
> * Ensure all future errors are fatal.
> */
> bprm->point_of_no_return = true;
>
> /* Make this the only thread in the thread group */
> - retval = de_thread(me);
> + retval = de_thread(me, bprm);
> if (retval)
> goto out;
> /* see the comment in check_unsafe_exec() */
> @@ -1144,11 +1187,6 @@ int begin_new_exec(struct linux_binprm * bprm)
> if (retval)
> goto out;
>
> - /* If the binary is not readable then enforce mm->dumpable=0 */
> - would_dump(bprm, bprm->file);
> - if (bprm->have_execfd)
> - would_dump(bprm, bprm->executable);
> -
> /*
> * Release all of the old mmap stuff
> */
> @@ -1210,18 +1248,6 @@ int begin_new_exec(struct linux_binprm * bprm)
>
> me->sas_ss_sp = me->sas_ss_size = 0;
>
> - /*
> - * Figure out dumpability. Note that this checking only of current
> - * is wrong, but userspace depends on it. This should be testing
> - * bprm->secureexec instead.
> - */
> - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
> - !(uid_eq(current_euid(), current_uid()) &&
> - gid_eq(current_egid(), current_gid())))
> - set_dumpable(current->mm, suid_dumpable);
> - else
> - set_dumpable(current->mm, SUID_DUMP_USER);
> -
> perf_event_exec();
>
> /*
> @@ -1361,6 +1387,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
> if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
> return -ERESTARTNOINTR;
>
> + if (unlikely(current->signal->exec_bprm)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + return -ERESTARTNOINTR;
> + }
#1
> +
> bprm->cred = prepare_exec_creds();
> if (likely(bprm->cred))
> return 0;
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 62d35631ba8c..e5bcf812cee0 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2838,6 +2838,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
> if (rv < 0)
> goto out_free;
>
Comment explaining why this needs checking goes here.
> + if (unlikely(current->signal->exec_bprm)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + rv = -ERESTARTNOINTR;
> + goto out_free;
> + }
> +
> rv = security_setprocattr(PROC_I(inode)->op.lsmid,
> file->f_path.dentry->d_name.name, page,
> count);
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index a102a10f833f..fb0361911489 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -153,6 +153,7 @@ extern const struct cred *get_task_cred(struct task_struct *);
> extern struct cred *cred_alloc_blank(void);
> extern struct cred *prepare_creds(void);
> extern struct cred *prepare_exec_creds(void);
> +extern bool is_dumpability_changed(const struct cred *, const struct cred *);
> extern int commit_creds(struct cred *);
> extern void abort_creds(struct cred *);
> extern struct cred *prepare_kernel_cred(struct task_struct *);
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 1ef1edbaaf79..3c47d8b55863 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -237,9 +237,27 @@ struct signal_struct {
> struct mm_struct *oom_mm; /* recorded mm when the thread group got
> * killed by the oom killer */
>
> + struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access
> + * against new credentials while
> + * de_thread is waiting for other
> + * traced threads to terminate.
> + * Set while de_thread is executing.
> + * The cred_guard_mutex is released
> + * after de_thread() has called
> + * zap_other_threads(), therefore
> + * a fatal signal is guaranteed to be
> + * already pending in the unlikely
> + * event, that
> + * current->signal->exec_bprm happens
> + * to be non-zero after the
> + * cred_guard_mutex was acquired.
> + */
> +
> struct mutex cred_guard_mutex; /* guard against foreign influences on
> * credential calculations
> * (notably. ptrace)
> + * Held while execve runs, except when
> + * a sibling thread is being traced.
> * Deprecated do not use in new code.
> * Use exec_update_lock instead.
> */
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 9676965c0981..0b2822c762df 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -375,6 +375,30 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
> return false;
> }
>
> +/**
> + * is_dumpability_changed - Will changing creds affect dumpability?
> + * @old: The old credentials.
> + * @new: The new credentials.
> + *
> + * If the @new credentials have no elevated privileges compared to the
> + * @old credentials, the task may remain dumpable. Otherwise we have
> + * to mark the task as undumpable to avoid information leaks from higher
> + * to lower privilege domains.
> + *
> + * Return: True if the task will become undumpable.
> + */
> +bool is_dumpability_changed(const struct cred *old, const struct cred *new)
> +{
> + if (!uid_eq(old->euid, new->euid) ||
> + !gid_eq(old->egid, new->egid) ||
> + !uid_eq(old->fsuid, new->fsuid) ||
> + !gid_eq(old->fsgid, new->fsgid) ||
> + !cred_cap_issubset(old, new))
> + return true;
> +
> + return false;
> +}
> +
> /**
> * commit_creds - Install new credentials upon the current task
> * @new: The credentials to be assigned
> @@ -403,11 +427,7 @@ int commit_creds(struct cred *new)
> get_cred(new); /* we will require a ref for the subj creds too */
>
> /* dumpability changes */
> - if (!uid_eq(old->euid, new->euid) ||
> - !gid_eq(old->egid, new->egid) ||
> - !uid_eq(old->fsuid, new->fsuid) ||
> - !gid_eq(old->fsgid, new->fsgid) ||
> - !cred_cap_issubset(old, new)) {
> + if (is_dumpability_changed(old, new)) {
> if (task->mm)
> set_dumpable(task->mm, suid_dumpable);
> task->pdeath_signal = 0;
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 75a84efad40f..230298817dbf 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -20,6 +20,7 @@
> #include <linux/pagemap.h>
> #include <linux/ptrace.h>
> #include <linux/security.h>
> +#include <linux/binfmts.h>
> #include <linux/signal.h>
> #include <linux/uio.h>
> #include <linux/audit.h>
> @@ -453,6 +454,28 @@ static int ptrace_attach(struct task_struct *task, long request,
> return retval;
> }
>
> + if (unlikely(task == task->signal->group_exec_task)) {
> + retval = down_write_killable(&task->signal->exec_update_lock);
> + if (retval)
> + return retval;
This could be written like:
ACQUIRE(rwsem_write_kill, guard)(&task->signal->exec_update_lock);
retval = ACQUIRE_ERR(rwsem_write_kill, guard);
if (retval)
return retval;
> +
> + scoped_guard (task_lock, task) {
> + struct linux_binprm *bprm = task->signal->exec_bprm;
> + const struct cred __rcu *old_cred = task->real_cred;
> + struct mm_struct *old_mm = task->mm;
> +
> + rcu_assign_pointer(task->real_cred, bprm->cred);
> + task->mm = bprm->mm;
> + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
> + rcu_assign_pointer(task->real_cred, old_cred);
> + task->mm = old_mm;
> + }
> +
> + up_write(&task->signal->exec_update_lock);
And then this goes away ^
> + if (retval)
> + return retval;
> + }
> +
> scoped_guard (write_lock_irq, &tasklist_lock) {
> if (unlikely(task->exit_state))
> return -EPERM;
> @@ -488,6 +511,14 @@ static int ptrace_traceme(void)
> {
> int ret = -EPERM;
>
This needs comments.
> + if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
> + return -ERESTARTNOINTR;
> +
> + if (unlikely(current->signal->exec_bprm)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + return -ERESTARTNOINTR;
> + }
#2
> +
> write_lock_irq(&tasklist_lock);
> /* Are we already being traced? */
> if (!current->ptrace) {
> @@ -503,6 +534,7 @@ static int ptrace_traceme(void)
> }
> }
> write_unlock_irq(&tasklist_lock);
> + mutex_unlock(&current->signal->cred_guard_mutex);
>
> return ret;
> }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 41aa761c7738..d61fc275235a 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1994,9 +1994,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
> * Make sure we cannot change seccomp or nnp state via TSYNC
> * while another thread is in the middle of calling exec.
> */
> - if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
> - mutex_lock_killable(&current->signal->cred_guard_mutex))
> - goto out_put_fd;
> + if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
> + if (mutex_lock_killable(&current->signal->cred_guard_mutex))
> + goto out_put_fd;
> +
> + if (unlikely(current->signal->exec_bprm)) {
> + mutex_unlock(&current->signal->cred_guard_mutex);
> + goto out_put_fd;
> + }
#3, and after typing this same pattern 3 times, you didn't think it
needed a helper function ?
> + }
>
> spin_lock_irq(&current->sighand->siglock);
>
More information about the Linux-security-module-archive
mailing list