[PATCH v2 01/10] vfs: move cap_convert_nscap() call into vfs_setxattr()

Mon Jan 11 13:49:16 UTC 2021

On Fri, Jan 01, 2021 at 11:35:16AM -0600, Eric W. Biederman wrote:
> Miklos Szeredi <mszeredi at redhat.com> writes:
> 
> > cap_convert_nscap() does permission checking as well as conversion of the
> > xattr value conditionally based on fs's user-ns.
> >
> > This is needed by overlayfs and probably other layered fs (ecryptfs) and is
> > what vfs_foo() is supposed to do anyway.
> 
> Well crap.
> 
> I just noticed this and it turns out this change is wrong.
> 
> The problem is that it reads the rootid from the v3 fscap, using
> current_user_ns() and then writes it using the sb->s_user_ns.
> 
> So any time the stacked filesystems sb->s_user_ns do not match or
> current_user_ns does not match sb->s_user_ns this could be a problem.
> 
> In a stacked filesystem a second pass through vfs_setxattr will result
> in the rootid being translated a second time (with potentially the wrong
> namespaces).  I think because of the security checks a we won't write
> something we shouldn't be able to write to the filesystem.  Still we
> will be writing the wrong v3 fscap which can go quite badly.
> 
> This doesn't look terribly difficult to fix.
> 
> Probably convert this into a fs independent form using uids in
> init_user_ns at input and have cap_convert_nscap convert the v3 fscap
> into the filesystem dependent form.  With some way for stackable
> filesystems to just skip converting it from the filesystem independent
> format.
> 
> Uids in xattrs that are expected to go directly to disk, but aren't
> always suitable for going directly to disk are tricky.

I've been looking at this for a couple of days and can't say I clearly
understand everything yet.

For one: a v2 fscap is supposed to be equivalent to a v3 fscap with a rootid of
zero, right?

If so, why does cap_inode_getsecurity() treat them differently (v2 fscap
succeeding unconditionally while v3 one being either converted to v2, rejected
or left as v3 depending on current_user_ns())?

Anyway, here's a patch that I think fixes getxattr() layering for
security.capability.  Does basically what you suggested.  Slight change of
semantics vs. v1 caps, not sure if that is still needed, getxattr()/setxattr()
hasn't worked for these since the introduction of v3 in 4.14.  Untested.

I still need to wrap my head around the permission requirements for the
setxattr() case...

Thanks,
Miklos

---
 fs/overlayfs/super.c       |   15 +++
 include/linux/capability.h |    2 
 include/linux/fs.h         |    1 
 security/commoncap.c       |  210 ++++++++++++++++++++++++---------------------
 4 files changed, 132 insertions(+), 96 deletions(-)

--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -395,6 +395,20 @@ static int ovl_remount(struct super_bloc
 	return ret;
 }
 
+static int ovl_cap_get(struct dentry *dentry,
+		       struct vfs_ns_cap_data *nscap)
+{
+	int res;
+	const struct cred *old_cred;
+	struct dentry *realdentry = ovl_dentry_real(dentry);
+
+	old_cred = ovl_override_creds(dentry->d_sb);
+	res = vfs_cap_get(realdentry, nscap);
+	revert_creds(old_cred);
+
+	return res;
+}
+
 static const struct super_operations ovl_super_operations = {
 	.alloc_inode	= ovl_alloc_inode,
 	.free_inode	= ovl_free_inode,
@@ -405,6 +419,7 @@ static const struct super_operations ovl
 	.statfs		= ovl_statfs,
 	.show_options	= ovl_show_options,
 	.remount_fs	= ovl_remount,
+	.cap_get	= ovl_cap_get,
 };
 
 enum {
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -272,4 +272,6 @@ extern int get_vfs_caps_from_disk(const
 
 extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size);
 
+int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
+
 #endif /* !_LINUX_CAPABILITY_H */
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1963,6 +1963,7 @@ struct super_operations {
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
 				    struct shrink_control *);
+	int (*cap_get)(struct dentry *dentry, struct vfs_ns_cap_data *nscap);
 };
 
 /*
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -341,6 +341,13 @@ static __u32 sansflags(__u32 m)
 	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
 }
 
+static bool is_v1header(size_t size, const struct vfs_cap_data *cap)
+{
+	if (size != XATTR_CAPS_SZ_1)
+		return false;
+	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_1;
+}
+
 static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
 {
 	if (size != XATTR_CAPS_SZ_2)
@@ -355,6 +362,72 @@ static bool is_v3header(size_t size, con
 	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
 }
 
+static bool validheader(size_t size, const struct vfs_cap_data *cap)
+{
+	return is_v1header(size, cap) || is_v2header(size, cap) || is_v3header(size, cap);
+}
+
+static void cap_to_v3(const struct vfs_cap_data *cap,
+			 struct vfs_ns_cap_data *nscap)
+{
+	u32 magic, nsmagic;
+
+	nsmagic = VFS_CAP_REVISION_3;
+	magic = le32_to_cpu(cap->magic_etc);
+	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+	nscap->magic_etc = cpu_to_le32(nsmagic);
+	nscap->rootid = cpu_to_le32(0);
+}
+
+static int default_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
+{
+	int err;
+	ssize_t size;
+	kuid_t kroot;
+	uid_t root, mappedroot;
+	char *tmpbuf = NULL;
+	struct vfs_cap_data *cap;
+	struct user_namespace *fs_ns = dentry->d_sb->s_user_ns;
+
+	size = vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf,
+				  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+	if (size < 0)
+		return size;
+
+	cap = (struct vfs_cap_data *) tmpbuf;
+	err = -EINVAL;
+	if (!validheader(size, cap))
+		goto out;
+
+	memset(nscap, 0, sizeof(*nscap));
+	memcpy(nscap, tmpbuf, size);
+	if (!is_v3header(size, cap))
+		cap_to_v3(cap, nscap);
+
+	/* Convert rootid from fs user namespace to init user namespace */
+	root = le32_to_cpu(nscap->rootid);
+	kroot = make_kuid(fs_ns, root);
+	mappedroot = from_kuid(&init_user_ns, kroot);
+	nscap->rootid = cpu_to_le32(mappedroot);
+
+	err = 0;
+out:
+	kfree(tmpbuf);
+	return err;
+}
+
+int vfs_cap_get(struct dentry *dentry, struct vfs_ns_cap_data *nscap)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	if (sb->s_op->cap_get)
+		return sb->s_op->cap_get(dentry, nscap);
+	else
+		return default_cap_get(dentry, nscap);
+}
+EXPORT_SYMBOL(vfs_cap_get);
+
 /*
  * getsecurity: We are called for security.* before any attempt to read the
  * xattr from the inode itself.
@@ -369,14 +442,15 @@ static bool is_v3header(size_t size, con
 int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
 			  bool alloc)
 {
-	int size, ret;
+	int ret;
+	ssize_t size;
 	kuid_t kroot;
+	__le32 nsmagic, magic;
 	uid_t root, mappedroot;
-	char *tmpbuf = NULL;
+	void *tmpbuf = NULL;
 	struct vfs_cap_data *cap;
-	struct vfs_ns_cap_data *nscap;
+	struct vfs_ns_cap_data nscap;
 	struct dentry *dentry;
-	struct user_namespace *fs_ns;
 
 	if (strcmp(name, "capability") != 0)
 		return -EOPNOTSUPP;
@@ -385,67 +459,50 @@ int cap_inode_getsecurity(struct inode *
 	if (!dentry)
 		return -EINVAL;
 
-	size = sizeof(struct vfs_ns_cap_data);
-	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
-				 &tmpbuf, size, GFP_NOFS);
+	ret = vfs_cap_get(dentry, &nscap);
 	dput(dentry);
 
 	if (ret < 0)
 		return ret;
 
-	fs_ns = inode->i_sb->s_user_ns;
-	cap = (struct vfs_cap_data *) tmpbuf;
-	if (is_v2header((size_t) ret, cap)) {
-		/* If this is sizeof(vfs_cap_data) then we're ok with the
-		 * on-disk value, so return that.  */
-		if (alloc)
-			*buffer = tmpbuf;
-		else
-			kfree(tmpbuf);
-		return ret;
-	} else if (!is_v3header((size_t) ret, cap)) {
-		kfree(tmpbuf);
-		return -EINVAL;
-	}
+	tmpbuf = kmalloc(sizeof(struct vfs_ns_cap_data), GFP_NOFS);
+	if (!tmpbuf)
+		return -ENOMEM;
 
-	nscap = (struct vfs_ns_cap_data *) tmpbuf;
-	root = le32_to_cpu(nscap->rootid);
-	kroot = make_kuid(fs_ns, root);
+	root = le32_to_cpu(nscap.rootid);
+	kroot = make_kuid(&init_user_ns, root);
 
 	/* If the root kuid maps to a valid uid in current ns, then return
 	 * this as a nscap. */
 	mappedroot = from_kuid(current_user_ns(), kroot);
 	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+		size = sizeof(struct vfs_cap_data);
 		if (alloc) {
 			*buffer = tmpbuf;
-			nscap->rootid = cpu_to_le32(mappedroot);
-		} else
-			kfree(tmpbuf);
-		return size;
+			tmpbuf = NULL;
+			nscap.rootid = cpu_to_le32(mappedroot);
+			memcpy(*buffer, &nscap, size);
+		}
+		goto out;
 	}
 
-	if (!rootid_owns_currentns(kroot)) {
-		kfree(tmpbuf);
-		return -EOPNOTSUPP;
-	}
+	size = -EOPNOTSUPP;
+	if (!rootid_owns_currentns(kroot))
+		goto out;
 
 	/* This comes from a parent namespace.  Return as a v2 capability */
 	size = sizeof(struct vfs_cap_data);
 	if (alloc) {
-		*buffer = kmalloc(size, GFP_ATOMIC);
-		if (*buffer) {
-			struct vfs_cap_data *cap = *buffer;
-			__le32 nsmagic, magic;
-			magic = VFS_CAP_REVISION_2;
-			nsmagic = le32_to_cpu(nscap->magic_etc);
-			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
-				magic |= VFS_CAP_FLAGS_EFFECTIVE;
-			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
-			cap->magic_etc = cpu_to_le32(magic);
-		} else {
-			size = -ENOMEM;
-		}
+		cap = *buffer = tmpbuf;
+		tmpbuf = NULL;
+		magic = VFS_CAP_REVISION_2;
+		nsmagic = le32_to_cpu(nscap.magic_etc);
+		if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
+			magic |= VFS_CAP_FLAGS_EFFECTIVE;
+		memcpy(&cap->data, &nscap.data, sizeof(__le32) * 2 * VFS_CAP_U32);
+		cap->magic_etc = cpu_to_le32(magic);
 	}
+out:
 	kfree(tmpbuf);
 	return size;
 }
@@ -462,11 +519,6 @@ static kuid_t rootid_from_xattr(const vo
 	return make_kuid(task_ns, rootid);
 }
 
-static bool validheader(size_t size, const struct vfs_cap_data *cap)
-{
-	return is_v2header(size, cap) || is_v3header(size, cap);
-}
-
 /*
  * User requested a write of security.capability.  If needed, update the
  * xattr to change from v2 to v3, or to fixup the v3 rootid.
@@ -570,74 +622,40 @@ static inline int bprm_caps_from_vfs_cap
 int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
 {
 	struct inode *inode = d_backing_inode(dentry);
-	__u32 magic_etc;
-	unsigned tocopy, i;
-	int size;
-	struct vfs_ns_cap_data data, *nscaps = &data;
-	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
-	kuid_t rootkuid;
-	struct user_namespace *fs_ns;
+	unsigned int i;
+	int ret;
+	struct vfs_ns_cap_data nscaps;
 
 	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
 
 	if (!inode)
 		return -ENODATA;
 
-	fs_ns = inode->i_sb->s_user_ns;
-	size = __vfs_getxattr((struct dentry *)dentry, inode,
-			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
-	if (size == -ENODATA || size == -EOPNOTSUPP)
+	ret = vfs_cap_get((struct dentry *) dentry, &nscaps);
+	if (ret == -ENODATA || ret == -EOPNOTSUPP)
 		/* no data, that's ok */
 		return -ENODATA;
 
-	if (size < 0)
-		return size;
-
-	if (size < sizeof(magic_etc))
-		return -EINVAL;
-
-	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
+	if (ret < 0)
+		return ret;
 
-	rootkuid = make_kuid(fs_ns, 0);
-	switch (magic_etc & VFS_CAP_REVISION_MASK) {
-	case VFS_CAP_REVISION_1:
-		if (size != XATTR_CAPS_SZ_1)
-			return -EINVAL;
-		tocopy = VFS_CAP_U32_1;
-		break;
-	case VFS_CAP_REVISION_2:
-		if (size != XATTR_CAPS_SZ_2)
-			return -EINVAL;
-		tocopy = VFS_CAP_U32_2;
-		break;
-	case VFS_CAP_REVISION_3:
-		if (size != XATTR_CAPS_SZ_3)
-			return -EINVAL;
-		tocopy = VFS_CAP_U32_3;
-		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
-		break;
+	cpu_caps->magic_etc = le32_to_cpu(nscaps.magic_etc);
+	cpu_caps->rootid = make_kuid(&init_user_ns, le32_to_cpu(nscaps.rootid));
 
-	default:
-		return -EINVAL;
-	}
 	/* Limit the caps to the mounter of the filesystem
 	 * or the more limited uid specified in the xattr.
 	 */
-	if (!rootid_owns_currentns(rootkuid))
+	if (!rootid_owns_currentns(cpu_caps->rootid))
 		return -ENODATA;
 
 	CAP_FOR_EACH_U32(i) {
-		if (i >= tocopy)
-			break;
-		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
-		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
+		cpu_caps->permitted.cap[i] = le32_to_cpu(nscaps.data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(nscaps.data[i].inheritable);
 	}
 
 	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
 	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
 
-	cpu_caps->rootid = rootkuid;
-
 	return 0;
 }