file metadata via fs API (was: [GIT PULL] Filesystem Information)

Miklos Szeredi miklos at szeredi.hu
Tue Aug 11 13:54:19 UTC 2020


On Wed, Aug 05, 2020 at 10:24:23AM +0200, Miklos Szeredi wrote:
> On Tue, Aug 4, 2020 at 4:36 PM Miklos Szeredi <miklos at szeredi.hu> wrote:
> 
> > I think we already lost that with the xattr API, that should have been
> > done in a way that fits this philosophy.  But given that we have "/"
> > as the only special purpose char in filenames, and even repetitions
> > are allowed, it's hard to think of a good way to do that.  Pity.
> 
> One way this could be solved is to allow opting into an alternative
> path resolution mode.
> 
> E.g.
>   openat(AT_FDCWD, "foo/bar//mnt/info", O_RDONLY | O_ALT);

Proof-of-concept patch and test program below.

Opted for triple slash in the hope that just maybe we could add a global
/proc/sys/fs/resolve_alt knob to optionally turn on alternative (non-POSIX) path
resolution without breaking too many things.  Will try that later...

Comments?

Thanks,
Miklos

cat_alt.c:
-------- >8 --------
#define _GNU_SOURCE
#include <err.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <linux/unistd.h>
#include <linux/openat2.h>

#define RESOLVE_ALT		0x20 /* Alternative path walk mode where
					multiple slashes have special meaning */

/*
 * cat_alt - open a file with openat2(2) using the RESOLVE_ALT resolve flag
 * and copy its contents to standard output.
 *
 * Usage: cat_alt path [dirfd] [--nofollow]
 *
 *   path        pathname passed unchanged to openat2()
 *   dirfd       optional numeric directory fd the path is resolved against
 *               (parsed with base auto-detection); defaults to AT_FDCWD
 *   --nofollow  adds O_NOFOLLOW to the open flags
 *
 * Returns 0 once the whole file has been copied.  Bad usage, an invalid
 * dirfd, or a failed open, read or write terminates the program with exit
 * status 1 via err()/errx().
 */
int main(int argc, char *argv[])
{
	struct open_how how = {
		.flags = O_RDONLY,
		.resolve = RESOLVE_ALT,
	};
	char buf[65536], *end;
	const char *path;
	int dfd = AT_FDCWD;
	int fd, i;

	if (argc < 2 || argc > 4)
		errx(1, "usage: %s path [dirfd] [--nofollow]", argv[0]);

	/* Only look at argv[1] once argc has been validated. */
	path = argv[1];

	for (i = 2; i < argc; i++) {
		if (strcmp(argv[i], "--nofollow") == 0) {
			how.flags |= O_NOFOLLOW;
		} else {
			dfd = strtoul(argv[i], &end, 0);
			/* Reject empty numbers and trailing garbage. */
			if (end == argv[i] || *end)
				errx(1, "invalid dirfd: %s", argv[i]);
		}
	}

	fd = syscall(__NR_openat2, dfd, path, &how, sizeof(how));
	if (fd == -1)
		err(1, "failed to open %s", path);

	while (1) {
		ssize_t res, done;

		res = read(fd, buf, sizeof(buf));
		if (res == -1)
			err(1, "failed to read file");
		if (res == 0)
			break;

		/*
		 * write() may accept fewer bytes than requested (pipes,
		 * terminals, signals), so keep going until the whole chunk
		 * has been emitted, and do not silently drop errors.
		 */
		for (done = 0; done < res; ) {
			ssize_t n = write(1, buf + done, res - done);

			if (n == -1)
				err(1, "failed to write output");
			if (n == 0)
				errx(1, "failed to write output: no progress");
			done += n;
		}
	}
	close(fd);
	return 0;
}
-------- >8 --------

---
 fs/Makefile                  |    2 
 fs/file_table.c              |   70 ++++++++++++++--------
 fs/fsmeta.c                  |  135 +++++++++++++++++++++++++++++++++++++++++++
 fs/internal.h                |    9 ++
 fs/mount.h                   |    4 +
 fs/namei.c                   |   77 +++++++++++++++++++++---
 fs/namespace.c               |   12 +++
 fs/open.c                    |    2 
 fs/proc_namespace.c          |    2 
 include/linux/fcntl.h        |    2 
 include/linux/namei.h        |    3 
 include/uapi/linux/magic.h   |    1 
 include/uapi/linux/openat2.h |    2 
 13 files changed, 282 insertions(+), 39 deletions(-)

--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y :=	open.o read_write.o file_table.
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
-		fs_types.o fs_context.o fs_parser.o fsopen.o
+		fs_types.o fs_context.o fs_parser.o fsopen.o fsmeta.o \
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -178,22 +178,9 @@ struct file *alloc_empty_file_noaccount(
 	return f;
 }
 
-/**
- * alloc_file - allocate and initialize a 'struct file'
- *
- * @path: the (dentry, vfsmount) pair for the new file
- * @flags: O_... flags with which the new file will be opened
- * @fop: the 'struct file_operations' for the new file
- */
-static struct file *alloc_file(const struct path *path, int flags,
-		const struct file_operations *fop)
+static void init_file(struct file *file, const struct path *path, int flags,
+		      const struct file_operations *fop)
 {
-	struct file *file;
-
-	file = alloc_empty_file(flags, current_cred());
-	if (IS_ERR(file))
-		return file;
-
 	file->f_path = *path;
 	file->f_inode = path->dentry->d_inode;
 	file->f_mapping = path->dentry->d_inode->i_mapping;
@@ -209,31 +196,66 @@ static struct file *alloc_file(const str
 	file->f_op = fop;
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_inc(path->dentry->d_inode);
+}
+
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
+ * @flags: O_... flags with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ */
+static struct file *alloc_file(const struct path *path, int flags,
+		const struct file_operations *fop)
+{
+	struct file *file;
+
+	file = alloc_empty_file(flags, current_cred());
+	if (IS_ERR(file))
+		return file;
+
+	init_file(file, path, flags, fop);
+
 	return file;
 }
 
-struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
-				const char *name, int flags,
-				const struct file_operations *fops)
+int init_file_pseudo(struct file *file, struct inode *inode,
+		     struct vfsmount *mnt, const char *name, int flags,
+		     const struct file_operations *fops)
 {
 	static const struct dentry_operations anon_ops = {
 		.d_dname = simple_dname
 	};
 	struct qstr this = QSTR_INIT(name, strlen(name));
 	struct path path;
-	struct file *file;
 
 	path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
 	if (!path.dentry)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	if (!mnt->mnt_sb->s_d_op)
 		d_set_d_op(path.dentry, &anon_ops);
 	path.mnt = mntget(mnt);
 	d_instantiate(path.dentry, inode);
-	file = alloc_file(&path, flags, fops);
-	if (IS_ERR(file)) {
-		ihold(inode);
-		path_put(&path);
+	init_file(file, &path, flags, fops);
+
+	return 0;
+}
+
+struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
+				const char *name, int flags,
+				const struct file_operations *fops)
+{
+	struct file *file;
+	int err;
+
+	file = alloc_empty_file(flags, current_cred());
+	if (IS_ERR(file))
+		return file;
+
+	err = init_file_pseudo(file, inode, mnt, name, flags, fops);
+	if (err) {
+		fput(file);
+		file = ERR_PTR(err);
 	}
 	return file;
 }
--- /dev/null
+++ b/fs/fsmeta.c
@@ -0,0 +1,135 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/seq_file.h>
+#include <linux/fs_struct.h>
+#include <linux/pseudo_fs.h>
+
+#include "mount.h"
+#include "internal.h"
+
+static struct vfsmount *fsmeta_mnt;
+static struct inode *fsmeta_inode;
+
+
+static struct vfsmount *fsmeta_mnt_info_get_mnt(struct seq_file *seq)
+{
+	struct proc_mounts *p = seq->private;
+
+	return &list_entry(p->cursor.mnt_list.next, struct mount, mnt_list)->mnt;
+}
+
+static void *fsmeta_mnt_info_start(struct seq_file *seq, loff_t *pos)
+{
+	mnt_namespace_lock_read();
+	return *pos == 0 ? fsmeta_mnt_info_get_mnt(seq) : NULL;
+}
+
+static void *fsmeta_mnt_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void fsmeta_mnt_info_stop(struct seq_file *seq, void *v)
+{
+	mnt_namespace_unlock_read();
+}
+
+static int fsmeta_mnt_info_show(struct seq_file *seq, void *v)
+{
+	return show_mountinfo(seq, v);
+}
+
+static const struct seq_operations fsmeta_mnt_info_sops = {
+	.start = fsmeta_mnt_info_start,
+	.next = fsmeta_mnt_info_next,
+	.stop = fsmeta_mnt_info_stop,
+	.show = fsmeta_mnt_info_show,
+};
+
+static int fsmeta_mnt_info_release(struct inode *inode, struct file *file)
+{
+	if (file->private_data) {
+		struct seq_file *seq = file->private_data;
+		struct proc_mounts *p = seq->private;
+
+		mntput(fsmeta_mnt_info_get_mnt(seq));
+		path_put(&p->root);
+
+		return seq_release_private(inode, file);
+	}
+	return 0;
+}
+
+static const struct file_operations fsmeta_mnt_info_fops = {
+	.release = fsmeta_mnt_info_release,
+	.read = seq_read,
+	.llseek = no_llseek,
+};
+
+static int fsmeta_mnt_info_open(struct file *file, const struct path *path,
+				const struct open_flags *op)
+{
+	struct proc_mounts *p;
+	int err;
+
+	err = init_file_pseudo(file, fsmeta_inode, fsmeta_mnt, "[mnt.info]",
+			       op->open_flag, &fsmeta_mnt_info_fops);
+	if (err)
+		return err;
+	/*
+	 * This reference is now sunk in file->f_path.dentry->d_inode and will
+	 * be released by fput()
+	 */
+	ihold(fsmeta_inode);
+
+	err = seq_open_private(file, &fsmeta_mnt_info_sops, sizeof(*p));
+	if (err)
+		return err;
+
+	p = ((struct seq_file *)file->private_data)->private;
+	get_fs_root(current->fs, &p->root);
+	p->cursor.mnt_list.next = &real_mount(mntget(path->mnt))->mnt_list;
+
+	return 0;
+}
+
+int fsmeta_open(const char *meta_name, const struct path *path,
+	      struct file *file, const struct open_flags *op)
+{
+	if (op->open_flag & ~(O_LARGEFILE | O_CLOEXEC | O_NOFOLLOW))
+		return -EINVAL;
+
+	if (strcmp(meta_name, "mnt/info") == 0)
+		return fsmeta_mnt_info_open(file, path, op);
+
+	pr_info("invalid fsmeta file <%s> on %pd4\n", meta_name, path->dentry);
+	return -EINVAL;
+}
+
+static int fsmeta_init_fs_context(struct fs_context *fc)
+{
+	return init_pseudo(fc, FSMETA_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type fsmeta_fs_type = {
+	.name		= "fsmeta",
+	.init_fs_context = fsmeta_init_fs_context,
+	.kill_sb	= kill_anon_super,
+};
+
+static int __init fsmeta_init(void)
+{
+	fsmeta_mnt = kern_mount(&fsmeta_fs_type);
+	if (IS_ERR(fsmeta_mnt))
+		panic("fsmeta_init() kernel mount failed (%ld)\n", PTR_ERR(fsmeta_mnt));
+
+	fsmeta_inode = alloc_anon_inode(fsmeta_mnt->mnt_sb);
+	if (IS_ERR(fsmeta_inode))
+		panic("fsmeta_init() inode allocation failed (%ld)\n", PTR_ERR(fsmeta_inode));
+
+	return 0;
+}
+fs_initcall(fsmeta_init);
+
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -99,6 +99,9 @@ extern void chroot_fs_refs(const struct
  */
 extern struct file *alloc_empty_file(int, const struct cred *);
 extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+extern int init_file_pseudo(struct file *file, struct inode *inode,
+			    struct vfsmount *mnt, const char *name, int flags,
+			    const struct file_operations *fops);
 
 /*
  * super.c
@@ -185,3 +188,9 @@ int sb_init_dio_done_wq(struct super_blo
  */
 int do_statx(int dfd, const char __user *filename, unsigned flags,
 	     unsigned int mask, struct statx __user *buffer);
+
+/*
+ * fs/fsmeta.c
+ */
+int fsmeta_open(const char *meta_name, const struct path *path,
+		struct file *file, const struct open_flags *op);
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -159,3 +159,7 @@ static inline bool is_anon_ns(struct mnt
 }
 
 extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+
+void mnt_namespace_lock_read(void);
+void mnt_namespace_unlock_read(void);
+int show_mountinfo(struct seq_file *m, struct vfsmount *mnt);
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2094,6 +2094,30 @@ static inline u64 hash_name(const void *
 
 #endif
 
+static int lookup_alt(const char *name, struct nameidata *nd)
+{
+	if ((nd->flags & LOOKUP_RCU) && unlazy_walk(nd) != 0)
+		return -ECHILD;
+
+	nd->last.name = name + 3;
+	nd->last_type = LAST_META;
+
+	return 0;
+}
+
+static bool is_alt(const char *name, struct nameidata *nd, int depth)
+{
+	if (!(nd->flags & LOOKUP_ALT))
+		return false;
+
+	/* no alternative lookup inside symlinks */
+	if (depth)
+		return false;
+
+	/* name[0] has already been verified to be a slash */
+	return name[1] == '/' && name[2] == '/' && name[3] != '/';
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -2111,8 +2135,13 @@ static int link_path_walk(const char *na
 	nd->flags |= LOOKUP_PARENT;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
-	while (*name=='/')
-		name++;
+	if (*name == '/') {
+		if (!is_alt(name, nd, depth)) {
+			do {
+				name++;
+			} while (*name == '/');
+		}
+	}
 	if (!*name)
 		return 0;
 
@@ -2122,6 +2151,9 @@ static int link_path_walk(const char *na
 		u64 hash_len;
 		int type;
 
+		if (*name == '/')
+			return lookup_alt(name, nd);
+
 		err = may_lookup(nd);
 		if (err)
 			return err;
@@ -2163,6 +2195,13 @@ static int link_path_walk(const char *na
 		 * If it wasn't NUL, we know it was '/'. Skip that
 		 * slash, and continue until no more slashes.
 		 */
+		if (is_alt(name, nd, depth)) {
+			link = walk_component(nd, WALK_TRAILING);
+			if (unlikely(link))
+				goto LINK;
+
+			return lookup_alt(name, nd);
+		}
 		do {
 			name++;
 		} while (unlikely(*name == '/'));
@@ -2183,6 +2222,7 @@ static int link_path_walk(const char *na
 			link = walk_component(nd, WALK_MORE);
 		}
 		if (unlikely(link)) {
+LINK:
 			if (IS_ERR(link))
 				return PTR_ERR(link);
 			/* a symlink to follow */
@@ -2239,11 +2279,11 @@ static const char *path_init(struct name
 	nd->path.dentry = NULL;
 
 	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
-	if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
+	if (*s == '/' && !is_alt(s, nd, 0) && !(flags & LOOKUP_IN_ROOT)) {
 		error = nd_jump_root(nd);
 		if (unlikely(error))
 			return ERR_PTR(error);
-		return s;
+		return s + 1;
 	}
 
 	/* Relative pathname -- get the starting-point it is relative to. */
@@ -2272,7 +2312,8 @@ static const char *path_init(struct name
 
 		dentry = f.file->f_path.dentry;
 
-		if (*s && unlikely(!d_can_lookup(dentry))) {
+		if (*s && unlikely(!d_can_lookup(dentry)) &&
+		    !is_alt(s, nd, 0)) {
 			fdput(f);
 			return ERR_PTR(-ENOTDIR);
 		}
@@ -2303,6 +2344,9 @@ static const char *path_init(struct name
 
 static inline const char *lookup_last(struct nameidata *nd)
 {
+	if (nd->last_type == LAST_META)
+		return ERR_PTR(-EINVAL);
+
 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
@@ -2331,7 +2375,7 @@ static int path_lookupat(struct nameidat
 
 	while (!(err = link_path_walk(s, nd)) &&
 	       (s = lookup_last(nd)) != NULL)
-		;
+		nd->flags &= ~LOOKUP_ALT;
 	if (!err)
 		err = complete_walk(nd);
 
@@ -2410,9 +2454,15 @@ static struct filename *filename_parenta
 	if (unlikely(retval == -ESTALE))
 		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
 	if (likely(!retval)) {
-		*last = nd.last;
-		*type = nd.last_type;
-		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+		if (nd.last_type == LAST_META) {
+			path_put(parent);
+			putname(name);
+			name = ERR_PTR(-EINVAL);
+		} else {
+			*last = nd.last;
+			*type = nd.last_type;
+			audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+		}
 	} else {
 		putname(name);
 		name = ERR_PTR(retval);
@@ -3123,6 +3173,10 @@ static const char *open_last_lookups(str
 	nd->flags |= op->intent;
 
 	if (nd->last_type != LAST_NORM) {
+		if (nd->last_type == LAST_META) {
+			return ERR_PTR(fsmeta_open(nd->last.name, &nd->path,
+						   file, op));
+		}
 		if (nd->depth)
 			put_link(nd);
 		return handle_dots(nd, nd->last_type);
@@ -3206,6 +3260,9 @@ static int do_open(struct nameidata *nd,
 	int acc_mode;
 	int error;
 
+	if (nd->last_type == LAST_META)
+		return 0;
+
 	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
 		error = complete_walk(nd);
 		if (error)
@@ -3355,7 +3412,7 @@ static struct file *path_openat(struct n
 		const char *s = path_init(nd, flags);
 		while (!(error = link_path_walk(s, nd)) &&
 		       (s = open_last_lookups(nd, file, op)) != NULL)
-			;
+			nd->flags &= ~LOOKUP_ALT;
 		if (!error)
 			error = do_open(nd, file, op);
 		terminate_walk(nd);
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -69,7 +69,7 @@ static DEFINE_IDA(mnt_group_ida);
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static DECLARE_RWSEM(namespace_sem);
+DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 
@@ -1435,6 +1435,16 @@ static inline void namespace_lock(void)
 	down_write(&namespace_sem);
 }
 
+void mnt_namespace_lock_read(void)
+{
+	down_read(&namespace_sem);
+}
+
+void mnt_namespace_unlock_read(void)
+{
+	up_read(&namespace_sem);
+}
+
 enum umount_tree_flags {
 	UMOUNT_SYNC = 1,
 	UMOUNT_PROPAGATE = 2,
--- a/fs/open.c
+++ b/fs/open.c
@@ -1098,6 +1098,8 @@ inline int build_open_flags(const struct
 		lookup_flags |= LOOKUP_BENEATH;
 	if (how->resolve & RESOLVE_IN_ROOT)
 		lookup_flags |= LOOKUP_IN_ROOT;
+	if (how->resolve & RESOLVE_ALT)
+		lookup_flags |= LOOKUP_ALT;
 
 	op->lookup_flags = lookup_flags;
 	return 0;
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -128,7 +128,7 @@ static int show_vfsmnt(struct seq_file *
 	return err;
 }
 
-static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
+int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 {
 	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -19,7 +19,7 @@
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
-	 RESOLVE_BENEATH | RESOLVE_IN_ROOT)
+	 RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_ALT)
 
 /* List of all open_how "versions". */
 #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -15,7 +15,7 @@ enum { MAX_NESTED_LINKS = 8 };
 /*
  * Type of the last component on LOOKUP_PARENT
  */
-enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
+enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_META};
 
 /* pathwalk mode */
 #define LOOKUP_FOLLOW		0x0001	/* follow links at the end */
@@ -27,6 +27,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
 
 #define LOOKUP_REVAL		0x0020	/* tell ->d_revalidate() to trust no cache */
 #define LOOKUP_RCU		0x0040	/* RCU pathwalk mode; semi-internal */
+#define LOOKUP_ALT		0x200000 /* Alternative path walk mode */
 
 /* These tell filesystem methods that we are dealing with the final component... */
 #define LOOKUP_OPEN		0x0100	/* ... in open */
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -88,6 +88,7 @@
 #define BPF_FS_MAGIC		0xcafe4a11
 #define AAFS_MAGIC		0x5a3c69f0
 #define ZONEFS_MAGIC		0x5a4f4653
+#define FSMETA_MAGIC		0x9f8ea387
 
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC		0x15013346
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -35,5 +35,7 @@ struct open_how {
 #define RESOLVE_IN_ROOT		0x10 /* Make all jumps to "/" and ".."
 					be scoped inside the dirfd
 					(similar to chroot(2)). */
+#define RESOLVE_ALT		0x20 /* Alternative path walk mode where
+					multiple slashes have special meaning */
 
 #endif /* _UAPI_LINUX_OPENAT2_H */



More information about the Linux-security-module-archive mailing list