[PATCH v7 5/9] pidfs, coredump: add PIDFD_INFO_COREDUMP

Alexander Mikhalitsyn alexander at mihalicyn.com
Thu May 15 14:08:13 UTC 2025


Am Do., 15. Mai 2025 um 00:04 Uhr schrieb Christian Brauner
<brauner at kernel.org>:
>
> Extend the PIDFD_GET_INFO ioctl() with the new PIDFD_INFO_COREDUMP
> mask flag. This adds the fields @coredump_mask and @coredump_cookie to
> struct pidfd_info.
>
> When a task coredumps the kernel will provide the following information
> to userspace in @coredump_mask:
>
> * PIDFD_COREDUMPED is raised if the task did actually coredump.
> * PIDFD_COREDUMP_SKIP is raised if the task skipped coredumping (e.g.,
>   undumpable).
> * PIDFD_COREDUMP_USER is raised if this is a regular coredump and
>   doesn't need special care by the coredump server.
> * PIDFD_COREDUMP_ROOT is raised if the generated coredump should be
>   treated as sensitive and the coredump server should restrict access
>   to the generated coredump to sufficiently privileged users.
>
> If userspace uses the coredump socket to process coredumps it needs to
> be able to discern connections from the kernel from connections from
> userspace (e.g., Python generating its own coredumps and forwarding
> them to systemd). The @coredump_cookie extension uses the SO_COOKIE of
> the new connection. This allows userspace to validate that the
> connection has been made from the kernel by a crashing task:
>
>    fd_coredump = accept4(fd_socket, NULL, NULL, SOCK_CLOEXEC);
>    getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, &fd_peer_pidfd_len);
>
>    struct pidfd_info info = {
>            .mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP,
>    };
>
>    ioctl(fd_peer_pidfd, PIDFD_GET_INFO, &info);
>    /* Refuse connections that aren't from a crashing task. */
>    if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED) )
>            close(fd_coredump);
>
>    /*
>     * Make sure that the coredump cookie matches the connection cookie.
>     * If they don't it's not the coredump connection from the kernel.
>     * We'll get another connection request in a bit.
>     */
>    getsockopt(fd_coredump, SOL_SOCKET, SO_COOKIE, &peer_cookie, &peer_cookie_len);
>    if (!info.coredump_cookie || (info.coredump_cookie != peer_cookie))
>            close(fd_coredump);
>
> The kernel guarantees that by the time the connection is made all
> PIDFD_INFO_COREDUMP info is available.
>
> Signed-off-by: Christian Brauner <brauner at kernel.org>

Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn at canonical.com>

> ---
>  fs/coredump.c              | 34 ++++++++++++++++++++
>  fs/pidfs.c                 | 79 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/pidfs.h      | 10 ++++++
>  include/uapi/linux/pidfd.h | 22 +++++++++++++
>  net/unix/af_unix.c         |  7 ++++
>  5 files changed, 152 insertions(+)
>
> diff --git a/fs/coredump.c b/fs/coredump.c
> index e1256ebb89c1..bfc4a32f737c 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -46,7 +46,9 @@
>  #include <linux/pidfs.h>
>  #include <linux/net.h>
>  #include <linux/socket.h>
> +#include <net/af_unix.h>
>  #include <net/net_namespace.h>
> +#include <net/sock.h>
>  #include <uapi/linux/pidfd.h>
>  #include <uapi/linux/un.h>
>
> @@ -598,6 +600,8 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
>                 if (IS_ERR(pidfs_file))
>                         return PTR_ERR(pidfs_file);
>
> +               pidfs_coredump(cp);
> +
>                 /*
>                  * Usermode helpers are childen of either
>                  * system_unbound_wq or of kthreadd. So we know that
> @@ -876,8 +880,34 @@ void do_coredump(const kernel_siginfo_t *siginfo)
>                         goto close_fail;
>                 }
>
> +               /*
> +                * Set the thread-group leader pid which is used for the
> +                * peer credentials during connect() below. Then
> +                * immediately register it in pidfs...
> +                */
> +               cprm.pid = task_tgid(current);
> +               retval = pidfs_register_pid(cprm.pid);
> +               if (retval) {
> +                       sock_release(socket);
> +                       goto close_fail;
> +               }
> +
> +               /*
> +                * ... and set the coredump information so userspace
> +                * has it available after connect()...
> +                */
> +               pidfs_coredump(&cprm);
> +
> +               /*
> +                * ... On connect() the peer credentials are recorded
> +                * and @cprm.pid registered in pidfs...
> +                */
>                 retval = kernel_connect(socket, (struct sockaddr *)(&addr),
>                                         addr_len, O_NONBLOCK | SOCK_COREDUMP);
> +
> +               /* ... So we can safely put our pidfs reference now... */
> +               pidfs_put_pid(cprm.pid);
> +
>                 if (retval) {
>                         if (retval == -EAGAIN)
>                                 coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
> @@ -886,6 +916,10 @@ void do_coredump(const kernel_siginfo_t *siginfo)
>                         goto close_fail;
>                 }
>
> +               /* ... and validate that @sk_peer_pid matches @cprm.pid. */
> +               if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
> +                       goto close_fail;
> +
>                 cprm.limit = RLIM_INFINITY;
>                 cprm.file = no_free_ptr(file);
>  #else
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 3b39e471840b..d7b9a0dd2db6 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -20,6 +20,7 @@
>  #include <linux/time_namespace.h>
>  #include <linux/utsname.h>
>  #include <net/net_namespace.h>
> +#include <linux/coredump.h>
>
>  #include "internal.h"
>  #include "mount.h"
> @@ -33,6 +34,8 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
>  struct pidfs_exit_info {
>         __u64 cgroupid;
>         __s32 exit_code;
> +       __u32 coredump_mask;
> +       __u64 coredump_cookie;
>  };
>
>  struct pidfs_inode {
> @@ -240,6 +243,22 @@ static inline bool pid_in_current_pidns(const struct pid *pid)
>         return false;
>  }
>
> +static __u32 pidfs_coredump_mask(unsigned long mm_flags)
> +{
> +       switch (__get_dumpable(mm_flags)) {
> +       case SUID_DUMP_USER:
> +               return PIDFD_COREDUMP_USER;
> +       case SUID_DUMP_ROOT:
> +               return PIDFD_COREDUMP_ROOT;
> +       case SUID_DUMP_DISABLE:
> +               return PIDFD_COREDUMP_SKIP;
> +       default:
> +               WARN_ON_ONCE(true);
> +       }
> +
> +       return 0;
> +}
> +
>  static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>  {
>         struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
> @@ -280,6 +299,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>                 }
>         }
>
> +       if (mask & PIDFD_INFO_COREDUMP) {
> +               kinfo.mask |= PIDFD_INFO_COREDUMP;
> +               smp_rmb();
> +               kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> +               kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
> +       }
> +
>         task = get_pid_task(pid, PIDTYPE_PID);
>         if (!task) {
>                 /*
> @@ -296,6 +322,16 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>         if (!c)
>                 return -ESRCH;
>
> +       if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) {
> +               task_lock(task);
> +               if (task->mm) {
> +                       smp_rmb();
> +                       kinfo.coredump_cookie = READ_ONCE(pidfs_i(inode)->__pei.coredump_cookie);
> +                       kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
> +               }
> +               task_unlock(task);
> +       }
> +
>         /* Unconditionally return identifiers and credentials, the rest only on request */
>
>         user_ns = current_user_ns();
> @@ -559,6 +595,49 @@ void pidfs_exit(struct task_struct *tsk)
>         }
>  }
>
> +#if defined(CONFIG_COREDUMP) && defined(CONFIG_UNIX)
> +void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie)
> +{
> +       struct pidfs_exit_info *exit_info;
> +       struct dentry *dentry = pid->stashed;
> +       struct inode *inode;
> +
> +       if (WARN_ON_ONCE(!dentry))
> +               return;
> +
> +       inode = d_inode(dentry);
> +       exit_info = &pidfs_i(inode)->__pei;
> +       /* Can't use smp_store_release() because of 32bit. */
> +       smp_wmb();
> +       WRITE_ONCE(exit_info->coredump_cookie, coredump_cookie);
> +}
> +#endif
> +
> +#ifdef CONFIG_COREDUMP
> +void pidfs_coredump(const struct coredump_params *cprm)
> +{
> +       struct pid *pid = cprm->pid;
> +       struct pidfs_exit_info *exit_info;
> +       struct dentry *dentry;
> +       struct inode *inode;
> +       __u32 coredump_mask = 0;
> +
> +       dentry = pid->stashed;
> +       if (WARN_ON_ONCE(!dentry))
> +               return;
> +
> +       inode = d_inode(dentry);
> +       exit_info = &pidfs_i(inode)->__pei;
> +       /* Note how we were coredumped. */
> +       coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
> +       /* Note that we actually did coredump. */
> +       coredump_mask |= PIDFD_COREDUMPED;
> +       /* If coredumping is set to skip we should never end up here. */
> +       VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
> +       smp_store_release(&exit_info->coredump_mask, coredump_mask);
> +}
> +#endif
> +
>  static struct vfsmount *pidfs_mnt __ro_after_init;
>
>  /*
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 2676890c4d0d..497997bc5e34 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -2,11 +2,21 @@
>  #ifndef _LINUX_PID_FS_H
>  #define _LINUX_PID_FS_H
>
> +struct coredump_params;
> +
>  struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
>  void __init pidfs_init(void);
>  void pidfs_add_pid(struct pid *pid);
>  void pidfs_remove_pid(struct pid *pid);
>  void pidfs_exit(struct task_struct *tsk);
> +#ifdef CONFIG_COREDUMP
> +void pidfs_coredump(const struct coredump_params *cprm);
> +#endif
> +#if defined(CONFIG_COREDUMP) && defined(CONFIG_UNIX)
> +void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie);
> +#elif defined(CONFIG_UNIX)
> +static inline void pidfs_coredump_cookie(struct pid *pid, u64 coredump_cookie) { }
> +#endif
>  extern const struct dentry_operations pidfs_dentry_operations;
>  int pidfs_register_pid(struct pid *pid);
>  void pidfs_get_pid(struct pid *pid);
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 8c1511edd0e9..69267c5ae6d0 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -25,9 +25,28 @@
>  #define PIDFD_INFO_CREDS               (1UL << 1) /* Always returned, even if not requested */
>  #define PIDFD_INFO_CGROUPID            (1UL << 2) /* Always returned if available, even if not requested */
>  #define PIDFD_INFO_EXIT                        (1UL << 3) /* Only returned if requested. */
> +#define PIDFD_INFO_COREDUMP            (1UL << 4) /* Only returned if requested. */
>
>  #define PIDFD_INFO_SIZE_VER0           64 /* sizeof first published struct */
>
> +/*
> + * Values for @coredump_mask in pidfd_info.
> + * Only valid if PIDFD_INFO_COREDUMP is set in @mask.
> + *
> + * Note, the @PIDFD_COREDUMP_ROOT flag indicates that the generated
> + * coredump should be treated as sensitive and access should only be
> + * granted to privileged users.
> + *
> + * If the coredump AF_UNIX socket is used for processing coredumps
> + * @coredump_cookie will be set to the socket SO_COOKIE of the receiver's
> + * client socket. This allows the coredump handler to detect whether an
> + * incoming coredump connection was initiated from the crashing task.
> + */
> +#define PIDFD_COREDUMPED       (1U << 0) /* Did crash and... */
> +#define PIDFD_COREDUMP_SKIP    (1U << 1) /* coredumping generation was skipped. */
> +#define PIDFD_COREDUMP_USER    (1U << 2) /* coredump was done as the user. */
> +#define PIDFD_COREDUMP_ROOT    (1U << 3) /* coredump was done as root. */
> +
>  /*
>   * The concept of process and threads in userland and the kernel is a confusing
>   * one - within the kernel every thread is a 'task' with its own individual PID,
> @@ -92,6 +111,9 @@ struct pidfd_info {
>         __u32 fsuid;
>         __u32 fsgid;
>         __s32 exit_code;
> +       __u32 coredump_mask;
> +       __u32 __spare1;
> +       __u64 coredump_cookie;
>  };
>
>  #define PIDFS_IOCTL_MAGIC 0xFF
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index a9d1c9ba2961..053d2e48e918 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -99,6 +99,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/skbuff.h>
>  #include <linux/slab.h>
> +#include <linux/sock_diag.h>
>  #include <linux/socket.h>
>  #include <linux/splice.h>
>  #include <linux/string.h>
> @@ -742,6 +743,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
>
>  struct unix_peercred {
>         struct pid *peer_pid;
> +       u64 cookie;
>         const struct cred *peer_cred;
>  };
>
> @@ -777,6 +779,8 @@ static void drop_peercred(struct unix_peercred *peercred)
>  static inline void init_peercred(struct sock *sk,
>                                  const struct unix_peercred *peercred)
>  {
> +       if (peercred->cookie)
> +               pidfs_coredump_cookie(peercred->peer_pid, peercred->cookie);
>         sk->sk_peer_pid = peercred->peer_pid;
>         sk->sk_peer_cred = peercred->peer_cred;
>  }
> @@ -1713,6 +1717,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
>         unix_peer(newsk)        = sk;
>         newsk->sk_state         = TCP_ESTABLISHED;
>         newsk->sk_type          = sk->sk_type;
> +       /* Prepare a new socket cookie for the receiver. */
> +       if (flags & SOCK_COREDUMP)
> +               peercred.cookie = sock_gen_cookie(newsk);
>         init_peercred(newsk, &peercred);
>         newu = unix_sk(newsk);
>         newu->listener = other;
>
> --
> 2.47.2
>



More information about the Linux-security-module-archive mailing list