[PATCH RESEND v3 bpf-next 01/14] bpf: introduce BPF token object

Tue Jul 4 12:43:53 UTC 2023

On Wed, Jun 28, 2023 at 10:18:19PM -0700, Andrii Nakryiko wrote:
> Add new kind of BPF kernel object, BPF token. BPF token is meant to to
> allow delegating privileged BPF functionality, like loading a BPF
> program or creating a BPF map, from privileged process to a *trusted*
> unprivileged process, all while have a good amount of control over which
> privileged operations could be performed using provided BPF token.
> 
> This patch adds new BPF_TOKEN_CREATE command to bpf() syscall, which
> allows to create a new BPF token object along with a set of allowed
> commands that such BPF token allows to unprivileged applications.
> Currently only BPF_TOKEN_CREATE command itself can be
> delegated, but other patches gradually add ability to delegate
> BPF_MAP_CREATE, BPF_BTF_LOAD, and BPF_PROG_LOAD commands.
> 
> The above means that new BPF tokens can be created using existing BPF
> token, if original privileged creator allowed BPF_TOKEN_CREATE command.
> New derived BPF token cannot be more powerful than the original BPF
> token.
> 
> Importantly, BPF token is automatically pinned at the specified location
> inside an instance of BPF FS and cannot be repinned using BPF_OBJ_PIN
> command, unlike BPF prog/map/btf/link. This provides more control over
> unintended sharing of BPF tokens through pinning it in another BPF FS
> instances.
> 
> Signed-off-by: Andrii Nakryiko <andrii at kernel.org>
> ---

The main issue I have with the token approach is that it is a completely
separate delegation vector on top of user namespaces. We mentioned this
duringthe conf and this was brought up on the thread here again as well.
Imho, that's a problem both security-wise and complexity-wise.

It's not great if each subsystem gets its own custom delegation
mechanism. This imposes such a taxing complexity on both kernel- and
userspace that it will quickly become a huge liability. So I would
really strongly encourage you to explore another direction.

I do think the spirit of your proposal is workable and that it can
mostly be kept in tact.

As mentioned before, bpffs has all the means to be taught delegation:

        // In container's user namespace
        fd_fs = fsopen("bpffs");

        // Delegating task in host userns (systemd-bpfd whatever you want)
        ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "delegate", ...);

        // In container's user namespace
        fd_mnt = fsmount(fd_fs, 0);

        ret = move_mount(fd_fs, "", -EBADF, "/my/fav/location", MOVE_MOUNT_F_EMPTY_PATH)

Roughly, this would mean:

(i) raise FS_USERNS_MOUNT on bpffs but guard it behind the "delegate"
    mount option. IOW, it's only possibly to mount bpffs as an
    unprivileged user if a delegating process like systemd-bpfd with
    system-level privileges has marked it as delegatable.
(ii) add fine-grained delegation options that you want this
     bpffs instance to allow via new mount options. Idk,

     // allow usage of foo
     fsconfig(fd_fs, FSCONFIG_SET_STRING, "abilities", "foo");

     // also allow usage of bar
     fsconfig(fd_fs, FSCONFIG_SET_STRING, "abilities", "bar");

     // reset allowed options
     fsconfig(fd_fs, FSCONFIG_SET_STRING, "");

     // allow usage of schmoo
     fsconfig(fd_fs, FSCONFIG_SET_STRING, "abilities", "schmoo");

This all seems more intuitive and integrates with user and mount
namespaces of the container. This can also work for restricting
non-userns bpf instances fwiw. You can also share instances via
bind-mount and so on. The userns of the bpffs instance can also be used
for permission checking provided a given functionality has been
delegated by e.g., systemd-bpfd or whatever.

So roughly - untested and unfinished:

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index b9b93b81af9a..c021b0a674bb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -623,15 +623,24 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
 }
 EXPORT_SYMBOL(bpf_prog_get_type_path);
 
+struct bpf_mount_opts {
+	umode_t mode;
+	bool delegate;
+	u64 abilities;
+};
+
 /*
  * Display the mount options in /proc/mounts.
  */
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
+	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
 	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
 
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
+	if (opts->delegate)
+		seq_printf(m, ",delegate");
 	return 0;
 }
 
@@ -655,17 +664,17 @@ static const struct super_operations bpf_super_ops = {
 
 enum {
 	OPT_MODE,
+	Opt_delegate,
+	Opt_abilities,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
-	fsparam_u32oct	("mode",			OPT_MODE),
+	fsparam_u32oct	     ("mode",			OPT_MODE),
+	fsparam_flag_no	     ("delegate",		Opt_delegate),
+	fsparam_string       ("abilities",		Opt_abilities),
 	{}
 };
 
-struct bpf_mount_opts {
-	umode_t mode;
-};
-
 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct bpf_mount_opts *opts = fc->fs_private;
@@ -694,6 +703,16 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
+	case Opt_delegate:
+		if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (!result.negated)
+			opts->delegate = true;
+		break;
+	case Opt_abilities:
+		// parse param->string to opts->abilities
+		break;
 	}
 
 	return 0;
@@ -768,10 +787,20 @@ static int populate_bpffs(struct dentry *parent)
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = sb->s_fs_info;
 	struct inode *inode;
 	int ret;
 
+	if (fc->user_ns != &init_user_ns && !opts->delegate) {
+		errorfc(fc, "Can't mount bpffs without delegation permissions");
+		return -EPERM;
+	}
+
+	if (opts->abilities && !opts->delegate) {
+		errorfc(fc, "Specifying abilities without enabling delegation");
+		return -EINVAL;
+	}
+
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -793,7 +822,10 @@ static int bpf_get_tree(struct fs_context *fc)
 
 static void bpf_free_fc(struct fs_context *fc)
 {
-	kfree(fc->fs_private);
+	struct bpf_mount_opts *opts = fc->s_fs_info;
+
+	if (opts)
+		kfree(opts);
 }
 
 static const struct fs_context_operations bpf_context_ops = {
@@ -815,17 +847,30 @@ static int bpf_init_fs_context(struct fs_context *fc)
 
 	opts->mode = S_IRWXUGO;
 
-	fc->fs_private = opts;
+	/* If an instance is delegated it will start with no abilities. */
+	opts->delegate = false;
+	opts->abilities = 0;
+
+	fc->s_fs_info = opts;
 	fc->ops = &bpf_context_ops;
 	return 0;
 }
 
+static void bpf_kill_super(struct super_block *sb)
+{
+	struct bpf_mount_opts *opts = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	kfree(opts);
+}
+
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
 	.init_fs_context = bpf_init_fs_context,
 	.parameters	= bpf_fs_parameters,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= bpf_kill_super,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 static int __init bpf_init(void)