[RFC PATCH 2/9] audit, io_uring, io-wq: add some basic audit support to io_uring
Richard Guy Briggs
rgb at redhat.com
Wed Jun 2 17:29:24 UTC 2021
On 2021-05-21 17:49, Paul Moore wrote:
> WARNING - This is a work in progress and should not be merged
> anywhere important. It is almost surely not complete, and while it
> probably compiles it likely hasn't been booted and will do terrible
> things. You have been warned.
>
> This patch adds basic auditing to io_uring operations, regardless of
> their context. This is accomplished by allocating audit_context
> structures for the io-wq worker and io_uring SQPOLL kernel threads
> as well as explicitly auditing the io_uring operations in
> io_issue_sqe(). The io_uring operations are audited using a new
> AUDIT_URINGOP record, an example is shown below:
>
> % <TODO - insert AUDIT_URINGOP record example>
>
> Thanks to Richard Guy Briggs for review and feedback.
>
> Signed-off-by: Paul Moore <paul at paul-moore.com>
> ---
> fs/io-wq.c | 4 +
> fs/io_uring.c | 11 +++
> include/linux/audit.h | 17 ++++
> include/uapi/linux/audit.h | 1
> kernel/audit.h | 2 +
> kernel/auditsc.c | 173 ++++++++++++++++++++++++++++++++++++++++++++
> 6 files changed, 208 insertions(+)
>
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 5361a9b4b47b..8af09a3336e0 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -16,6 +16,7 @@
> #include <linux/rculist_nulls.h>
> #include <linux/cpu.h>
> #include <linux/tracehook.h>
> +#include <linux/audit.h>
>
> #include "io-wq.h"
>
> @@ -535,6 +536,8 @@ static int io_wqe_worker(void *data)
> snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
> set_task_comm(current, buf);
>
> + audit_alloc_kernel(current);
> +
> while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
> long ret;
>
> @@ -573,6 +576,7 @@ static int io_wqe_worker(void *data)
> raw_spin_unlock_irq(&wqe->lock);
> }
>
> + audit_free(current);
> io_worker_exit(worker);
> return 0;
> }
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index e481ac8a757a..e9941d1ad8fd 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -78,6 +78,7 @@
> #include <linux/task_work.h>
> #include <linux/pagemap.h>
> #include <linux/io_uring.h>
> +#include <linux/audit.h>
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/io_uring.h>
> @@ -6105,6 +6106,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
> if (req->work.creds && req->work.creds != current_cred())
> creds = override_creds(req->work.creds);
>
> + if (req->opcode < IORING_OP_LAST)
> + audit_uring_entry(req->opcode);
> +
> switch (req->opcode) {
> case IORING_OP_NOP:
> ret = io_nop(req, issue_flags);
> @@ -6211,6 +6215,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
> break;
> }
>
> + if (req->opcode < IORING_OP_LAST)
> + audit_uring_exit(!ret, ret);
> +
> if (creds)
> revert_creds(creds);
>
> @@ -6827,6 +6834,8 @@ static int io_sq_thread(void *data)
> set_cpus_allowed_ptr(current, cpu_online_mask);
> current->flags |= PF_NO_SETAFFINITY;
>
> + audit_alloc_kernel(current);
> +
> mutex_lock(&sqd->lock);
> /* a user may had exited before the thread started */
> io_run_task_work_head(&sqd->park_task_work);
> @@ -6916,6 +6925,8 @@ static int io_sq_thread(void *data)
> io_run_task_work_head(&sqd->park_task_work);
> mutex_unlock(&sqd->lock);
>
> + audit_free(current);
> +
> complete(&sqd->exited);
> do_exit(0);
> }
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 82b7c1116a85..6a0c013bc7de 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -286,7 +286,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
> /* These are defined in auditsc.c */
> /* Public API */
> extern int audit_alloc(struct task_struct *task);
> +extern int audit_alloc_kernel(struct task_struct *task);
> extern void __audit_free(struct task_struct *task);
> +extern void __audit_uring_entry(u8 op);
> +extern void __audit_uring_exit(int success, long code);
> extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
> unsigned long a2, unsigned long a3);
> extern void __audit_syscall_exit(int ret_success, long ret_value);
> @@ -323,6 +326,16 @@ static inline void audit_free(struct task_struct *task)
> if (unlikely(task->audit_context))
> __audit_free(task);
> }
> +static inline void audit_uring_entry(u8 op)
> +{
> + if (unlikely(audit_context()))
> + __audit_uring_entry(op);
> +}
> +static inline void audit_uring_exit(int success, long code)
> +{
> + if (unlikely(audit_context()))
> + __audit_uring_exit(success, code);
> +}
> static inline void audit_syscall_entry(int major, unsigned long a0,
> unsigned long a1, unsigned long a2,
> unsigned long a3)
> @@ -554,6 +567,10 @@ static inline int audit_alloc(struct task_struct *task)
> {
> return 0;
> }
> +static inline int audit_alloc_kernel(struct task_struct *task)
> +{
> + return 0;
> +}
> static inline void audit_free(struct task_struct *task)
> { }
> static inline void audit_syscall_entry(int major, unsigned long a0,
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index cd2d8279a5e4..b26e0c435e8b 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -118,6 +118,7 @@
> #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */
> #define AUDIT_BPF 1334 /* BPF subsystem */
> #define AUDIT_EVENT_LISTENER 1335 /* Task joined multicast read socket */
> +#define AUDIT_URINGOP 1336 /* io_uring operation */
>
> #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */
> #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */
> diff --git a/kernel/audit.h b/kernel/audit.h
> index fba180de5912..50de827497ca 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -100,10 +100,12 @@ struct audit_context {
> enum {
> AUDIT_CTX_UNUSED, /* audit_context is currently unused */
> AUDIT_CTX_SYSCALL, /* in use by syscall */
> + AUDIT_CTX_URING, /* in use by io_uring */
> } context;
> enum audit_state state, current_state;
> unsigned int serial; /* serial number for record */
> int major; /* syscall number */
> + int uring_op; /* uring operation */
> struct timespec64 ctime; /* time of syscall entry */
> unsigned long argv[4]; /* syscall arguments */
> long return_code;/* syscall return code */
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index cc89e9f9a753..729849d41631 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -953,6 +953,7 @@ static void audit_reset_context(struct audit_context *ctx)
> ctx->current_state = ctx->state;
> ctx->serial = 0;
> ctx->major = 0;
> + ctx->uring_op = 0;
> ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
> memset(ctx->argv, 0, sizeof(ctx->argv));
> ctx->return_code = 0;
> @@ -1038,6 +1039,31 @@ int audit_alloc(struct task_struct *tsk)
> return 0;
> }
>
> +/**
> + * audit_alloc_kernel - allocate an audit_context for a kernel task
> + * @tsk: the kernel task
> + *
> + * Similar to the audit_alloc() function, but intended for kernel private
> + * threads. Returns zero on success, negative values on failure.
> + */
> +int audit_alloc_kernel(struct task_struct *tsk)
> +{
> + /*
> + * At the moment we are just going to call into audit_alloc() to
> + * simplify the code, but there two things to keep in mind with this
> + * approach:
> + *
> + * 1. Filtering internal kernel tasks is a bit laughable in almost all
> + * cases, but there is at least one case where there is a benefit:
> + * the '-a task,never' case allows the admin to effectively disable
> + * task auditing at runtime.
> + *
> + * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
> + * on these internal kernel tasks, but they probably don't hurt either.
> + */
> + return audit_alloc(tsk);
> +}
> +
> static inline void audit_free_context(struct audit_context *context)
> {
> /* resetting is extra work, but it is likely just noise */
> @@ -1536,6 +1562,52 @@ static void audit_log_proctitle(void)
> audit_log_end(ab);
> }
>
> +/**
> + * audit_log_uring - generate a AUDIT_URINGOP record
> + * @ctx: the audit context
> + */
> +static void audit_log_uring(struct audit_context *ctx)
> +{
> + struct audit_buffer *ab;
> + const struct cred *cred;
> +
> + /*
> + * TODO: What do we log here? I'm tossing in a few things to start the
> + * conversation, but additional thought needs to go into this.
> + */
> +
> + ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_URINGOP);
> + if (!ab)
> + return;
> + cred = current_cred();
This may need to be req->work.creds. I haven't been following if the
io_uring thread inherited the user task's creds (and below, comm and
exe).
> + audit_log_format(ab, "uring_op=%d", ctx->uring_op);
arch is stored below in __audit_uring_entry() and never used in the
AUDIT_CTX_URING case. That assignment can either be dropped or printed
before uring_op similar to the SYSCALL record. There aren't really any
arg[0-3] to print.
io_uring_register and io_uring_setup() args are better covered by other
records. io_uring_enter() has 6 args and the last two aren't covered by
SYSCALL anyways.
> + if (ctx->return_valid != AUDITSC_INVALID)
> + audit_log_format(ab, " success=%s exit=%ld",
> + (ctx->return_valid == AUDITSC_SUCCESS ?
> + "yes" : "no"),
> + ctx->return_code);
> + audit_log_format(ab,
> + " items=%d"
> + " ppid=%d pid=%d auid=%u uid=%u gid=%u"
> + " euid=%u suid=%u fsuid=%u"
> + " egid=%u sgid=%u fsgid=%u",
> + ctx->name_count,
> + task_ppid_nr(current),
> + task_tgid_nr(current),
> + from_kuid(&init_user_ns, audit_get_loginuid(current)),
> + from_kuid(&init_user_ns, cred->uid),
> + from_kgid(&init_user_ns, cred->gid),
> + from_kuid(&init_user_ns, cred->euid),
> + from_kuid(&init_user_ns, cred->suid),
> + from_kuid(&init_user_ns, cred->fsuid),
> + from_kgid(&init_user_ns, cred->egid),
> + from_kgid(&init_user_ns, cred->sgid),
> + from_kgid(&init_user_ns, cred->fsgid));
The audit session ID is still important, relevant and qualifies auid.
In keeping with the SYSCALL record format, I think we want to keep
ses=audit_get_sessionid(current) in here.
I'm pretty sure we also want to keep comm= and exe= too, but may have to
reach into req->task to get it. There are two values for comm possible,
one from the original task and second "iou-sqp-<pid>" set at the top of
io_sq_thread().
I'm reluctant to leave them out now and then have to re-add them in yet
another field order later.
> + audit_log_task_context(ab);
> + audit_log_key(ab, ctx->filterkey);
> + audit_log_end(ab);
> +}
> +
> static void audit_log_exit(void)
> {
> int i, call_panic = 0;
> @@ -1571,6 +1643,9 @@ static void audit_log_exit(void)
> audit_log_key(ab, context->filterkey);
> audit_log_end(ab);
> break;
> + case AUDIT_CTX_URING:
> + audit_log_uring(context);
> + break;
> default:
> BUG();
> break;
> @@ -1740,6 +1815,104 @@ static void audit_return_fixup(struct audit_context *ctx,
> ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
> }
>
> +/**
> + * __audit_uring_entry - prepare the kernel task's audit context for io_uring
> + * @op: the io_uring opcode
> + *
> + * This is similar to audit_syscall_entry() but is intended for use by io_uring
> + * operations.
> + */
> +void __audit_uring_entry(u8 op)
> +{
> + struct audit_context *ctx = audit_context();
> +
> + if (!audit_enabled || !ctx || ctx->state == AUDIT_DISABLED)
> + return;
> +
> + /*
> + * NOTE: It's possible that we can be called from the process' context
> + * before it returns to userspace, and before audit_syscall_exit()
> + * is called. In this case there is not much to do, just record
> + * the io_uring details and return.
> + */
> + ctx->uring_op = op;
> + if (ctx->context == AUDIT_CTX_SYSCALL)
> + return;
> +
> + ctx->dummy = !audit_n_rules;
> + if (!ctx->dummy && ctx->state == AUDIT_BUILD_CONTEXT)
> + ctx->prio = 0;
> +
> + ctx->arch = syscall_get_arch(current);
> + ctx->context = AUDIT_CTX_URING;
> + ctx->current_state = ctx->state;
> + ktime_get_coarse_real_ts64(&ctx->ctime);
> +}
> +
> +/**
> + * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
> + * @success: true/false value to indicate if the operation succeeded or not
> + * @code: operation return code
> + *
> + * This is similar to audit_syscall_exit() but is intended for use by io_uring
> + * operations.
> + */
> +void __audit_uring_exit(int success, long code)
> +{
> + struct audit_context *ctx = audit_context();
> +
> + /*
> + * TODO: At some point we will likely want to filter on io_uring ops
> + * and other things similar to what we do for syscalls, but that
> + * is something for another day; just record what we can here.
> + */
> +
> + if (!ctx || ctx->dummy)
> + goto out;
> + if (ctx->context == AUDIT_CTX_SYSCALL) {
> + /*
> + * NOTE: See the note in __audit_uring_entry() about the case
> + * where we may be called from process context before we
> + * return to userspace via audit_syscall_exit(). In this
> + * case we simply emit a URINGOP record and bail, the
> + * normal syscall exit handling will take care of
> + * everything else.
> + * It is also worth mentioning that when we are called,
> + * the current process creds may differ from the creds
> + * used during the normal syscall processing; keep that
> + * in mind if/when we move the record generation code.
> + */
> +
> + /*
> + * We need to filter on the syscall info here to decide if we
> + * should emit a URINGOP record. I know it seems odd but this
> + * solves the problem where users have a filter to block *all*
> + * syscall records in the "exit" filter; we want to preserve
> + * the behavior here.
> + */
> + audit_filter_syscall(current, ctx);
> + audit_filter_inodes(current, ctx);
> + if (ctx->current_state != AUDIT_RECORD_CONTEXT)
> + return;
> +
> + audit_log_uring(ctx);
> + return;
> + }
> +
> + /* this may generate CONFIG_CHANGE records */
> + if (!list_empty(&ctx->killed_trees))
> + audit_kill_trees(ctx);
> +
> + audit_filter_inodes(current, ctx);
> + if (ctx->current_state != AUDIT_RECORD_CONTEXT)
> + goto out;
> + audit_return_fixup(ctx, success, code);
> + audit_log_exit();
> +
> +out:
> + audit_reset_context(ctx);
> +}
> +
> /**
> * __audit_syscall_entry - fill in an audit record at syscall entry
> * @major: major syscall type (function)
>
> --
> Linux-audit mailing list
> Linux-audit at redhat.com
> https://listman.redhat.com/mailman/listinfo/linux-audit
- RGB
--
Richard Guy Briggs <rgb at redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
More information about the Linux-security-module-archive
mailing list