[PATCH v4 2/8] audit,io_uring,io-wq: add some basic audit support to io_uring
Richard Guy Briggs
rgb at redhat.com
Thu Sep 16 13:33:08 UTC 2021
On 2021-09-15 12:49, Paul Moore wrote:
> This patch adds basic auditing to io_uring operations, regardless of
> their context. This is accomplished by allocating audit_context
> structures for the io-wq worker and io_uring SQPOLL kernel threads
> as well as explicitly auditing the io_uring operations in
> io_issue_sqe(). Individual io_uring operations can bypass auditing
> through the "audit_skip" field in the struct io_op_def definition for
> the operation; although great care must be taken so that security
> relevant io_uring operations do not bypass auditing; please contact
> the audit mailing list (see the MAINTAINERS file) with any questions.
>
> The io_uring operations are audited using a new AUDIT_URINGOP record,
> an example is shown below:
>
> type=UNKNOWN[1336] msg=audit(1630523381.288:260):
> uring_op=19 success=yes exit=0 items=0 ppid=853 pid=1204
> uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0
> subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
> key=(null)
> AUID="root" UID="root" GID="root" EUID="root" SUID="root"
> FSUID="root" EGID="root" SGID="root" FSGID="root"
>
> Thanks to Richard Guy Briggs for review and feedback.
I share Steve's concerns about the missing auid and ses. The userspace
log interpreter conjured up AUID="root" from the absent auid=.
Some of the creds are here including ppid, pid, a herd of *id and subj.
*Something* initiated this action and then delegated it to io_uring to
carry out. That should be in there somewhere. You had a concern about
shared queues and mis-attribution. All of these creds including auid
and ses should be kept together to get this right.
> Signed-off-by: Paul Moore <paul at paul-moore.com>
>
> ---
> v4:
> - removed some work-in-progress comments
> - removed the auid logging in audit_log_uring()
> v3:
> - removed work-in-progress warning from the description
> v2:
> - added dummy funcs for audit_uring_{entry,exit}()
> - replaced opcode checks in io_issue_sqe() with audit_skip checks
> - moved fastpath checks into audit_uring_{entry,exit}()
> - audit_log_uring() uses GFP_ATOMIC
> - don't record the arch in __audit_uring_entry()
> v1:
> - initial draft
> ---
> fs/io-wq.c | 4 +
> fs/io_uring.c | 55 +++++++++++++--
> include/linux/audit.h | 26 +++++++
> include/uapi/linux/audit.h | 1
> kernel/audit.h | 2 +
> kernel/auditsc.c | 166 ++++++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 248 insertions(+), 6 deletions(-)
>
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 6c55362c1f99..dac5c5961c9d 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -14,6 +14,7 @@
> #include <linux/rculist_nulls.h>
> #include <linux/cpu.h>
> #include <linux/tracehook.h>
> +#include <linux/audit.h>
>
> #include "io-wq.h"
>
> @@ -562,6 +563,8 @@ static int io_wqe_worker(void *data)
> snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
> set_task_comm(current, buf);
>
> + audit_alloc_kernel(current);
> +
> while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
> long ret;
>
> @@ -601,6 +604,7 @@ static int io_wqe_worker(void *data)
> io_worker_handle_work(worker);
> }
>
> + audit_free(current);
> io_worker_exit(worker);
> return 0;
> }
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 16fb7436043c..388754b24785 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -79,6 +79,7 @@
> #include <linux/pagemap.h>
> #include <linux/io_uring.h>
> #include <linux/tracehook.h>
> +#include <linux/audit.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/io_uring.h>
> @@ -917,6 +918,8 @@ struct io_op_def {
> unsigned needs_async_setup : 1;
> /* should block plug */
> unsigned plug : 1;
> + /* skip auditing */
> + unsigned audit_skip : 1;
> /* size of async data needed, if any */
> unsigned short async_size;
> };
> @@ -930,6 +933,7 @@ static const struct io_op_def io_op_defs[] = {
> .buffer_select = 1,
> .needs_async_setup = 1,
> .plug = 1,
> + .audit_skip = 1,
> .async_size = sizeof(struct io_async_rw),
> },
> [IORING_OP_WRITEV] = {
> @@ -939,16 +943,19 @@ static const struct io_op_def io_op_defs[] = {
> .pollout = 1,
> .needs_async_setup = 1,
> .plug = 1,
> + .audit_skip = 1,
> .async_size = sizeof(struct io_async_rw),
> },
> [IORING_OP_FSYNC] = {
> .needs_file = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_READ_FIXED] = {
> .needs_file = 1,
> .unbound_nonreg_file = 1,
> .pollin = 1,
> .plug = 1,
> + .audit_skip = 1,
> .async_size = sizeof(struct io_async_rw),
> },
> [IORING_OP_WRITE_FIXED] = {
> @@ -957,15 +964,20 @@ static const struct io_op_def io_op_defs[] = {
> .unbound_nonreg_file = 1,
> .pollout = 1,
> .plug = 1,
> + .audit_skip = 1,
> .async_size = sizeof(struct io_async_rw),
> },
> [IORING_OP_POLL_ADD] = {
> .needs_file = 1,
> .unbound_nonreg_file = 1,
> + .audit_skip = 1,
> + },
> + [IORING_OP_POLL_REMOVE] = {
> + .audit_skip = 1,
> },
> - [IORING_OP_POLL_REMOVE] = {},
> [IORING_OP_SYNC_FILE_RANGE] = {
> .needs_file = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_SENDMSG] = {
> .needs_file = 1,
> @@ -983,18 +995,23 @@ static const struct io_op_def io_op_defs[] = {
> .async_size = sizeof(struct io_async_msghdr),
> },
> [IORING_OP_TIMEOUT] = {
> + .audit_skip = 1,
> .async_size = sizeof(struct io_timeout_data),
> },
> [IORING_OP_TIMEOUT_REMOVE] = {
> /* used by timeout updates' prep() */
> + .audit_skip = 1,
> },
> [IORING_OP_ACCEPT] = {
> .needs_file = 1,
> .unbound_nonreg_file = 1,
> .pollin = 1,
> },
> - [IORING_OP_ASYNC_CANCEL] = {},
> + [IORING_OP_ASYNC_CANCEL] = {
> + .audit_skip = 1,
> + },
> [IORING_OP_LINK_TIMEOUT] = {
> + .audit_skip = 1,
> .async_size = sizeof(struct io_timeout_data),
> },
> [IORING_OP_CONNECT] = {
> @@ -1009,14 +1026,19 @@ static const struct io_op_def io_op_defs[] = {
> },
> [IORING_OP_OPENAT] = {},
> [IORING_OP_CLOSE] = {},
> - [IORING_OP_FILES_UPDATE] = {},
> - [IORING_OP_STATX] = {},
> + [IORING_OP_FILES_UPDATE] = {
> + .audit_skip = 1,
> + },
> + [IORING_OP_STATX] = {
> + .audit_skip = 1,
> + },
> [IORING_OP_READ] = {
> .needs_file = 1,
> .unbound_nonreg_file = 1,
> .pollin = 1,
> .buffer_select = 1,
> .plug = 1,
> + .audit_skip = 1,
> .async_size = sizeof(struct io_async_rw),
> },
> [IORING_OP_WRITE] = {
> @@ -1025,39 +1047,50 @@ static const struct io_op_def io_op_defs[] = {
> .unbound_nonreg_file = 1,
> .pollout = 1,
> .plug = 1,
> + .audit_skip = 1,
> .async_size = sizeof(struct io_async_rw),
> },
> [IORING_OP_FADVISE] = {
> .needs_file = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_MADVISE] = {},
> [IORING_OP_SEND] = {
> .needs_file = 1,
> .unbound_nonreg_file = 1,
> .pollout = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_RECV] = {
> .needs_file = 1,
> .unbound_nonreg_file = 1,
> .pollin = 1,
> .buffer_select = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_OPENAT2] = {
> },
> [IORING_OP_EPOLL_CTL] = {
> .unbound_nonreg_file = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_SPLICE] = {
> .needs_file = 1,
> .hash_reg_file = 1,
> .unbound_nonreg_file = 1,
> + .audit_skip = 1,
> + },
> + [IORING_OP_PROVIDE_BUFFERS] = {
> + .audit_skip = 1,
> + },
> + [IORING_OP_REMOVE_BUFFERS] = {
> + .audit_skip = 1,
> },
> - [IORING_OP_PROVIDE_BUFFERS] = {},
> - [IORING_OP_REMOVE_BUFFERS] = {},
> [IORING_OP_TEE] = {
> .needs_file = 1,
> .hash_reg_file = 1,
> .unbound_nonreg_file = 1,
> + .audit_skip = 1,
> },
> [IORING_OP_SHUTDOWN] = {
> .needs_file = 1,
> @@ -6591,6 +6624,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
> if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
> creds = override_creds(req->creds);
>
> + if (!io_op_defs[req->opcode].audit_skip)
> + audit_uring_entry(req->opcode);
> +
> switch (req->opcode) {
> case IORING_OP_NOP:
> ret = io_nop(req, issue_flags);
> @@ -6706,6 +6742,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
> break;
> }
>
> + if (!io_op_defs[req->opcode].audit_skip)
> + audit_uring_exit(!ret, ret);
> +
> if (creds)
> revert_creds(creds);
> if (ret)
> @@ -7360,6 +7399,8 @@ static int io_sq_thread(void *data)
> set_cpus_allowed_ptr(current, cpu_online_mask);
> current->flags |= PF_NO_SETAFFINITY;
>
> + audit_alloc_kernel(current);
> +
> mutex_lock(&sqd->lock);
> while (1) {
> bool cap_entries, sqt_spin = false;
> @@ -7425,6 +7466,8 @@ static int io_sq_thread(void *data)
> io_run_task_work();
> mutex_unlock(&sqd->lock);
>
> + audit_free(current);
> +
> complete(&sqd->exited);
> do_exit(0);
> }
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 82b7c1116a85..d656a06dd909 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -286,7 +286,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
> /* These are defined in auditsc.c */
> /* Public API */
> extern int audit_alloc(struct task_struct *task);
> +extern int audit_alloc_kernel(struct task_struct *task);
> extern void __audit_free(struct task_struct *task);
> +extern void __audit_uring_entry(u8 op);
> +extern void __audit_uring_exit(int success, long code);
> extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
> unsigned long a2, unsigned long a3);
> extern void __audit_syscall_exit(int ret_success, long ret_value);
> @@ -323,6 +326,21 @@ static inline void audit_free(struct task_struct *task)
> if (unlikely(task->audit_context))
> __audit_free(task);
> }
> +static inline void audit_uring_entry(u8 op)
> +{
> + /*
> + * We intentionally check audit_context() before audit_enabled as most
> + * Linux systems (as of ~2021) rely on systemd which forces audit to
> + * be enabled regardless of the user's audit configuration.
> + */
> + if (unlikely(audit_context() && audit_enabled))
> + __audit_uring_entry(op);
> +}
> +static inline void audit_uring_exit(int success, long code)
> +{
> + if (unlikely(!audit_dummy_context()))
> + __audit_uring_exit(success, code);
> +}
> static inline void audit_syscall_entry(int major, unsigned long a0,
> unsigned long a1, unsigned long a2,
> unsigned long a3)
> @@ -554,8 +572,16 @@ static inline int audit_alloc(struct task_struct *task)
> {
> return 0;
> }
> +static inline int audit_alloc_kernel(struct task_struct *task)
> +{
> + return 0;
> +}
> static inline void audit_free(struct task_struct *task)
> { }
> +static inline void audit_uring_entry(u8 op)
> +{ }
> +static inline void audit_uring_exit(int success, long code)
> +{ }
> static inline void audit_syscall_entry(int major, unsigned long a0,
> unsigned long a1, unsigned long a2,
> unsigned long a3)
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index daa481729e9b..a1997697c8b1 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -118,6 +118,7 @@
> #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */
> #define AUDIT_BPF 1334 /* BPF subsystem */
> #define AUDIT_EVENT_LISTENER 1335 /* Task joined multicast read socket */
> +#define AUDIT_URINGOP 1336 /* io_uring operation */
>
> #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */
> #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 13abc48de0bd..d1161e3b83e2 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -103,10 +103,12 @@ struct audit_context {
> enum {
> AUDIT_CTX_UNUSED, /* audit_context is currently unused */
> AUDIT_CTX_SYSCALL, /* in use by syscall */
> + AUDIT_CTX_URING, /* in use by io_uring */
> } context;
> enum audit_state state, current_state;
> unsigned int serial; /* serial number for record */
> int major; /* syscall number */
> + int uring_op; /* uring operation */
> struct timespec64 ctime; /* time of syscall entry */
> unsigned long argv[4]; /* syscall arguments */
> long return_code;/* syscall return code */
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index f3d309b05c2d..6dda448fb826 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -959,6 +959,7 @@ static void audit_reset_context(struct audit_context *ctx)
> ctx->current_state = ctx->state;
> ctx->serial = 0;
> ctx->major = 0;
> + ctx->uring_op = 0;
> ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
> memset(ctx->argv, 0, sizeof(ctx->argv));
> ctx->return_code = 0;
> @@ -1044,6 +1045,31 @@ int audit_alloc(struct task_struct *tsk)
> return 0;
> }
>
> +/**
> + * audit_alloc_kernel - allocate an audit_context for a kernel task
> + * @tsk: the kernel task
> + *
> + * Similar to the audit_alloc() function, but intended for kernel private
> + * threads. Returns zero on success, negative values on failure.
> + */
> +int audit_alloc_kernel(struct task_struct *tsk)
> +{
> + /*
> + * At the moment we are just going to call into audit_alloc() to
> + * simplify the code, but there are two things to keep in mind with this
> + * approach:
> + *
> + * 1. Filtering internal kernel tasks is a bit laughable in almost all
> + * cases, but there is at least one case where there is a benefit:
> + * the '-a task,never' case allows the admin to effectively disable
> + * task auditing at runtime.
> + *
> + * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
> + * on these internal kernel tasks, but they probably don't hurt either.
> + */
> + return audit_alloc(tsk);
> +}
> +
> static inline void audit_free_context(struct audit_context *context)
> {
> /* resetting is extra work, but it is likely just noise */
> @@ -1546,6 +1572,44 @@ static void audit_log_proctitle(void)
> audit_log_end(ab);
> }
>
> +/**
> + * audit_log_uring - generate an AUDIT_URINGOP record
> + * @ctx: the audit context
> + */
> +static void audit_log_uring(struct audit_context *ctx)
> +{
> + struct audit_buffer *ab;
> + const struct cred *cred;
> +
> + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
> + if (!ab)
> + return;
> + cred = current_cred();
> + audit_log_format(ab, "uring_op=%d", ctx->uring_op);
> + if (ctx->return_valid != AUDITSC_INVALID)
> + audit_log_format(ab, " success=%s exit=%ld",
> + (ctx->return_valid == AUDITSC_SUCCESS ?
> + "yes" : "no"),
> + ctx->return_code);
> + audit_log_format(ab,
> + " items=%d"
> + " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
> + " fsuid=%u egid=%u sgid=%u fsgid=%u",
> + ctx->name_count,
> + task_ppid_nr(current), task_tgid_nr(current),
> + from_kuid(&init_user_ns, cred->uid),
> + from_kgid(&init_user_ns, cred->gid),
> + from_kuid(&init_user_ns, cred->euid),
> + from_kuid(&init_user_ns, cred->suid),
> + from_kuid(&init_user_ns, cred->fsuid),
> + from_kgid(&init_user_ns, cred->egid),
> + from_kgid(&init_user_ns, cred->sgid),
> + from_kgid(&init_user_ns, cred->fsgid));
> + audit_log_task_context(ab);
> + audit_log_key(ab, ctx->filterkey);
> + audit_log_end(ab);
> +}
> +
> static void audit_log_exit(void)
> {
> int i, call_panic = 0;
> @@ -1581,6 +1645,9 @@ static void audit_log_exit(void)
> audit_log_key(ab, context->filterkey);
> audit_log_end(ab);
> break;
> + case AUDIT_CTX_URING:
> + audit_log_uring(context);
> + break;
> default:
> BUG();
> break;
> @@ -1751,6 +1818,105 @@ static void audit_return_fixup(struct audit_context *ctx,
> ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
> }
>
> +/**
> + * __audit_uring_entry - prepare the kernel task's audit context for io_uring
> + * @op: the io_uring opcode
> + *
> + * This is similar to audit_syscall_entry() but is intended for use by io_uring
> + * operations. This function should only ever be called from
> + * audit_uring_entry() as we rely on the audit context checking present in that
> + * function.
> + */
> +void __audit_uring_entry(u8 op)
> +{
> + struct audit_context *ctx = audit_context();
> +
> + if (ctx->state == AUDIT_STATE_DISABLED)
> + return;
> +
> + /*
> + * NOTE: It's possible that we can be called from the process' context
> + * before it returns to userspace, and before audit_syscall_exit()
> + * is called. In this case there is not much to do, just record
> + * the io_uring details and return.
> + */
> + ctx->uring_op = op;
> + if (ctx->context == AUDIT_CTX_SYSCALL)
> + return;
> +
> + ctx->dummy = !audit_n_rules;
> + if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
> + ctx->prio = 0;
> +
> + ctx->context = AUDIT_CTX_URING;
> + ctx->current_state = ctx->state;
> + ktime_get_coarse_real_ts64(&ctx->ctime);
> +}
> +
> +/**
> + * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
> + * @success: true/false value to indicate if the operation succeeded or not
> + * @code: operation return code
> + *
> + * This is similar to audit_syscall_exit() but is intended for use by io_uring
> + * operations. This function should only ever be called from
> + * audit_uring_exit() as we rely on the audit context checking present in that
> + * function.
> + */
> +void __audit_uring_exit(int success, long code)
> +{
> + struct audit_context *ctx = audit_context();
> +
> + /*
> + * TODO: At some point we will likely want to filter on io_uring ops
> + * and other things similar to what we do for syscalls, but that
> + * is something for another day; just record what we can here.
> + */
> +
> + if (ctx->context == AUDIT_CTX_SYSCALL) {
> + /*
> + * NOTE: See the note in __audit_uring_entry() about the case
> + * where we may be called from process context before we
> + * return to userspace via audit_syscall_exit(). In this
> + * case we simply emit a URINGOP record and bail, the
> + * normal syscall exit handling will take care of
> + * everything else.
> + * It is also worth mentioning that when we are called,
> + * the current process creds may differ from the creds
> + * used during the normal syscall processing; keep that
> + * in mind if/when we move the record generation code.
> + */
> +
> + /*
> + * We need to filter on the syscall info here to decide if we
> + * should emit a URINGOP record. I know it seems odd but this
> + * solves the problem where users have a filter to block *all*
> + * syscall records in the "exit" filter; we want to preserve
> + * the behavior here.
> + */
> + audit_filter_syscall(current, ctx);
> + audit_filter_inodes(current, ctx);
> + if (ctx->current_state != AUDIT_STATE_RECORD)
> + return;
> +
> + audit_log_uring(ctx);
> + return;
> + }
> +
> + /* this may generate CONFIG_CHANGE records */
> + if (!list_empty(&ctx->killed_trees))
> + audit_kill_trees(ctx);
> +
> + audit_filter_inodes(current, ctx);
> + if (ctx->current_state != AUDIT_STATE_RECORD)
> + goto out;
> + audit_return_fixup(ctx, success, code);
> + audit_log_exit();
> +
> +out:
> + audit_reset_context(ctx);
> +}
> +
> /**
> * __audit_syscall_entry - fill in an audit record at syscall entry
> * @major: major syscall type (function)
>
- RGB
--
Richard Guy Briggs <rgb at redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
More information about the Linux-security-module-archive
mailing list