[PATCH] userfaultfd, capability: introduce CAP_USERFAULTFD

Casey Schaufler casey at schaufler-ca.com
Thu Feb 24 19:12:44 UTC 2022


On 2/24/2022 10:19 AM, Axel Rasmussen wrote:
> Historically, it has been shown that intercepting kernel faults with
> userfaultfd (thereby forcing the kernel to wait for an arbitrary amount
> of time) can be exploited, or at least can make some kinds of exploits
> easier. So, in 37cd0575b8 "userfaultfd: add UFFD_USER_MODE_ONLY" we
> changed things so, in order for kernel faults to be handled by
> userfaultfd, either the process needs CAP_SYS_PTRACE, or this sysctl
> must be configured so that any unprivileged user can do it.
>
> In a typical implementation of a hypervisor with live migration (take
> QEMU/KVM as one such example), we do indeed need to be able to handle
> kernel faults. But, both options above are less than ideal:
>
> - Toggling the sysctl increases attack surface by allowing any
>    unprivileged user to do it.
>
> - Granting the live migration process CAP_SYS_PTRACE gives it this
>    ability, but *also* the ability to "observe and control the
>    execution of another process [...], and examine and change [its]
>    memory and registers" (from ptrace(2)). This isn't something we need
>    or want to be able to do, so granting this permission violates the
>    "principle of least privilege".
>
> This is all a long winded way to say: we want a more fine-grained way to
> grant access to userfaultfd, without granting other additional
> permissions at the same time.
>
> So, add CAP_USERFAULTFD, for this specific case.

TL;DR - No. We don't add new capabilities for a single use.

You have a program that is already using a reasonably restrictive
capability (compared to CAP_SYS_ADMIN, for example) and which I
assume you have implemented appropriately for the level of privilege
used. If you can demonstrate that this CAP_USERFAULTFD has applicability
beyond your specific implementation (and the name would imply otherwise)
it could be worth considering, but as it is, no.

>
> Set up a helper which accepts either CAP_USERFAULTFD, or for backward
> compatibility reasons (existing userspaces may depend on the old way of
> doing things), CAP_SYS_PTRACE.
>
> One special case is UFFD_FEATURE_EVENT_FORK: this is left requiring only
> CAP_SYS_PTRACE, since it is specifically about manipulating the memory
> of another (child) process, it seems like a better fit the way it is. To
> my knowledge, this isn't a feature required by typical live migration
> implementations, so this doesn't obviate the above.
>
> Signed-off-by: Axel Rasmussen <axelrasmussen at google.com>
> ---
>   fs/userfaultfd.c                    | 6 +++---
>   include/linux/capability.h          | 5 +++++
>   include/uapi/linux/capability.h     | 7 ++++++-
>   security/selinux/include/classmap.h | 4 ++--
>   4 files changed, 16 insertions(+), 6 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index e26b10132d47..1ec0d9b49a70 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -411,7 +411,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
>   	    ctx->flags & UFFD_USER_MODE_ONLY) {
>   		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
>   			"sysctl knob to 1 if kernel faults must be handled "
> -			"without obtaining CAP_SYS_PTRACE capability\n");
> +			"without obtaining CAP_USERFAULTFD capability\n");
>   		goto out;
>   	}
>   
> @@ -2068,10 +2068,10 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
>   
>   	if (!sysctl_unprivileged_userfaultfd &&
>   	    (flags & UFFD_USER_MODE_ONLY) == 0 &&
> -	    !capable(CAP_SYS_PTRACE)) {
> +	    !userfaultfd_capable()) {
>   		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
>   			"sysctl knob to 1 if kernel faults must be handled "
> -			"without obtaining CAP_SYS_PTRACE capability\n");
> +			"without obtaining CAP_USERFAULTFD capability\n");
>   		return -EPERM;
>   	}
>   
> diff --git a/include/linux/capability.h b/include/linux/capability.h
> index 65efb74c3585..f1e7b3506432 100644
> --- a/include/linux/capability.h
> +++ b/include/linux/capability.h
> @@ -270,6 +270,11 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
>   		ns_capable(ns, CAP_SYS_ADMIN);
>   }
>   
> +static inline bool userfaultfd_capable(void)
> +{
> +	return capable(CAP_USERFAULTFD) || capable(CAP_SYS_PTRACE);
> +}
> +
>   /* audit system wants to get cap info from files as well */
>   int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
>   			   const struct dentry *dentry,
> diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
> index 463d1ba2232a..83a5d8601508 100644
> --- a/include/uapi/linux/capability.h
> +++ b/include/uapi/linux/capability.h
> @@ -231,6 +231,7 @@ struct vfs_ns_cap_data {
>   #define CAP_SYS_CHROOT       18
>   
>   /* Allow ptrace() of any process */
> +/* Allow everything under CAP_USERFAULTFD for backward compatibility */
>   
>   #define CAP_SYS_PTRACE       19
>   
> @@ -417,7 +418,11 @@ struct vfs_ns_cap_data {
>   
>   #define CAP_CHECKPOINT_RESTORE	40
>   
> -#define CAP_LAST_CAP         CAP_CHECKPOINT_RESTORE
> +/* Allow intercepting kernel faults with userfaultfd */
> +
> +#define CAP_USERFAULTFD		41
> +
> +#define CAP_LAST_CAP         CAP_USERFAULTFD
>   
>   #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
>   
> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> index 35aac62a662e..98e37b220159 100644
> --- a/security/selinux/include/classmap.h
> +++ b/security/selinux/include/classmap.h
> @@ -28,9 +28,9 @@
>   
>   #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
>   		"wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf", \
> -		"checkpoint_restore"
> +		"checkpoint_restore", "userfaultfd"
>   
> -#if CAP_LAST_CAP > CAP_CHECKPOINT_RESTORE
> +#if CAP_LAST_CAP > CAP_USERFAULTFD
>   #error New capability defined, please update COMMON_CAP2_PERMS.
>   #endif
>   



More information about the Linux-security-module-archive mailing list