[PATCH 1/7] General notification queue with user mmap()'able ring buffer

Greg KH gregkh at linuxfoundation.org
Tue May 28 16:26:03 UTC 2019


On Tue, May 28, 2019 at 05:01:55PM +0100, David Howells wrote:
> Implement a misc device that implements a general notification queue as a
> ring buffer that can be mmap()'d from userspace.

"general" but just for filesystems, right?  :(

> Each entry has a 1-slot header that describes it:
> 
> 	struct watch_notification {
> 		__u32	type:24;
> 		__u32	subtype:8;
> 		__u32	info;
> 	};

This doesn't match the structure definition in the documentation, so
something is out of sync.

> The type indicates the source (eg. mount tree changes, superblock events,
> keyring changes, block layer events) and the subtype indicates the event
> type (eg. mount, unmount; EIO, EDQUOT; link, unlink).  The info field
> indicates a number of things, including the entry length, an ID assigned to
> a watchpoint contributing to this buffer, type-specific flags and meta
> flags, such as an overrun indicator.
> 
> Supplementary data, such as the key ID that generated an event, are
> attached in additional slots.

I'm all for a "generic" event system for the kernel (heck, Solaris has
had one for decades), but it keeps getting shot down every time it comes
up.  What is different about this one?

> --- a/drivers/misc/Kconfig
> +++ b/drivers/misc/Kconfig
> @@ -4,6 +4,19 @@
>  
>  menu "Misc devices"
>  
> +config WATCH_QUEUE
> +	bool "Mappable notification queue"
> +	default n

Nit, not needed.

> +	depends on MMU
> +	help
> +	  This is a general notification queue for the kernel to pass events to
> +	  userspace through a mmap()'able ring buffer.  It can be used in
> +	  conjunction with watches for mount topology change notifications,
> +	  superblock change notifications and key/keyring change notifications.
> +
> +	  Note that in theory this should work fine with NOMMU, but I'm not
> +	  sure how to make that work.
> +
>  config SENSORS_LIS3LV02D
>  	tristate
>  	depends on INPUT
> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> index b9affcdaa3d6..bf16acd9f8cc 100644
> --- a/drivers/misc/Makefile
> +++ b/drivers/misc/Makefile
> @@ -3,6 +3,7 @@
>  # Makefile for misc devices that really don't fit anywhere else.
>  #
>  
> +obj-$(CONFIG_WATCH_QUEUE)	+= watch_queue.o
>  obj-$(CONFIG_IBM_ASM)		+= ibmasm/
>  obj-$(CONFIG_IBMVMC)		+= ibmvmc.o
>  obj-$(CONFIG_AD525X_DPOT)	+= ad525x_dpot.o
> diff --git a/drivers/misc/watch_queue.c b/drivers/misc/watch_queue.c
> new file mode 100644
> index 000000000000..39a09ea15d97
> --- /dev/null
> +++ b/drivers/misc/watch_queue.c
> @@ -0,0 +1,877 @@
> +/* User-mappable watch queue
> + *
> + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.

You didn't touch the code this year?

> + * Written by David Howells (dhowells at redhat.com)
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public Licence
> + * as published by the Free Software Foundation; either version
> + * 2 of the Licence, or (at your option) any later version.

Please drop the boiler plate text and use a SPDX tag, checkpatch should
have caught this.  I don't want to have to go and change it again.

> + *
> + * See Documentation/watch_queue.rst
> + */
> +
> +#define pr_fmt(fmt) "watchq: " fmt
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/printk.h>
> +#include <linux/miscdevice.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/pagemap.h>
> +#include <linux/poll.h>
> +#include <linux/uaccess.h>
> +#include <linux/vmalloc.h>
> +#include <linux/file.h>
> +#include <linux/security.h>
> +#include <linux/cred.h>
> +#include <linux/watch_queue.h>
> +
> +#define DEBUG_WITH_WRITE /* Allow use of write() to record notifications */

debugging code left in?

> +
> +MODULE_DESCRIPTION("Watch queue");
> +MODULE_AUTHOR("Red Hat, Inc.");
> +MODULE_LICENSE("GPL");
> +
> +struct watch_type_filter {
> +	enum watch_notification_type type;
> +	__u32		subtype_filter[1];	/* Bitmask of subtypes to filter on */
> +	__u32		info_filter;		/* Filter on watch_notification::info */
> +	__u32		info_mask;		/* Mask of relevant bits in info_filter */
> +};
> +
> +struct watch_filter {
> +	union {
> +		struct rcu_head	rcu;
> +		unsigned long	type_filter[2];	/* Bitmask of accepted types */
> +	};
> +	u32		nr_filters;		/* Number of filters */
> +	struct watch_type_filter filters[];
> +};
> +
> +struct watch_queue {
> +	struct rcu_head		rcu;
> +	struct address_space	mapping;
> +	const struct cred	*cred;		/* Creds of the owner of the queue */
> +	struct watch_filter __rcu *filter;
> +	wait_queue_head_t	waiters;
> +	struct hlist_head	watches;	/* Contributory watches */
> +	refcount_t		usage;

Usage of what, this structure?  Or something else?

> +	spinlock_t		lock;
> +	bool			defunct;	/* T when queues closed */
> +	u8			nr_pages;	/* Size of pages[] */
> +	u8			flag_next;	/* Flag to apply to next item */
> +#ifdef DEBUG_WITH_WRITE
> +	u8			debug;
> +#endif
> +	u32			size;
> +	struct watch_queue_buffer *buffer;	/* Pointer to first record */
> +
> +	/* The mappable pages.  The zeroth page holds the ring pointers. */
> +	struct page		**pages;
> +};


> +EXPORT_SYMBOL(__post_watch_notification);

_GPL for new apis?  (I have to ask...)

> +static long watch_queue_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +	struct watch_queue *wqueue = file->private_data;
> +	struct inode *inode = file_inode(file);
> +	long ret;
> +
> +	switch (cmd) {
> +	case IOC_WATCH_QUEUE_SET_SIZE:
> +		if (wqueue->buffer)
> +			return -EBUSY;
> +		inode_lock(inode);
> +		ret = watch_queue_set_size(wqueue, arg);
> +		inode_unlock(inode);
> +		return ret;
> +
> +	case IOC_WATCH_QUEUE_SET_FILTER:
> +		inode_lock(inode);
> +		ret = watch_queue_set_filter(
> +			inode, wqueue,
> +			(struct watch_notification_filter __user *)arg);
> +		inode_unlock(inode);
> +		return ret;
> +
> +	default:
> +		return -EOPNOTSUPP;

-ENOTTY is the correct "not a valid ioctl" error value, right?

> +	}
> +}

> +/**
> + * put_watch_queue - Dispose of a ref on a watchqueue.
> + * @wqueue: The watch queue to unref.
> + */
> +void put_watch_queue(struct watch_queue *wqueue)
> +{
> +	if (refcount_dec_and_test(&wqueue->usage))
> +		kfree_rcu(wqueue, rcu);

Why not just use a kref?

> +}
> +EXPORT_SYMBOL(put_watch_queue);


> +int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
> +{
> +	struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
> +	struct watch *w;
> +
> +	hlist_for_each_entry(w, &wlist->watchers, list_node) {
> +		if (watch->id == w->id)
> +			return -EBUSY;
> +	}
> +
> +	rcu_assign_pointer(watch->watch_list, wlist);
> +
> +	spin_lock_bh(&wqueue->lock);
> +	refcount_inc(&wqueue->usage);
> +	hlist_add_head(&watch->queue_node, &wqueue->watches);
> +	spin_unlock_bh(&wqueue->lock);
> +
> +	hlist_add_head(&watch->list_node, &wlist->watchers);
> +	return 0;
> +}
> +EXPORT_SYMBOL(add_watch_to_object);

Naming nit, shouldn't the "prefix" all be the same for these new
functions?

watch_queue_add_object()?  watch_queue_put()?  And so on?

> +static int __init watch_queue_init(void)
> +{
> +	int ret;
> +
> +	ret = misc_register(&watch_queue_dev);
> +	if (ret < 0)
> +		pr_err("Failed to register %d\n", ret);
> +	return ret;
> +}
> +fs_initcall(watch_queue_init);
> +
> +static void __exit watch_queue_exit(void)
> +{
> +	misc_deregister(&watch_queue_dev);
> +}
> +module_exit(watch_queue_exit);

module_misc_device()?


> --- /dev/null
> +++ b/include/linux/watch_queue.h
> @@ -0,0 +1,86 @@
> +/* User-mappable watch queue
> + *
> + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
> + * Written by David Howells (dhowells at redhat.com)
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public Licence
> + * as published by the Free Software Foundation; either version
> + * 2 of the Licence, or (at your option) any later version.

Again, SPDX headers please.

> --- /dev/null
> +++ b/include/uapi/linux/watch_queue.h
> @@ -0,0 +1,82 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */

Yeah!!!

No copyright?  :(

> +#ifndef _UAPI_LINUX_WATCH_QUEUE_H
> +#define _UAPI_LINUX_WATCH_QUEUE_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define IOC_WATCH_QUEUE_SET_SIZE	_IO('s', 0x01)	/* Set the size in pages */
> +#define IOC_WATCH_QUEUE_SET_FILTER	_IO('s', 0x02)	/* Set the filter */
> +
> +enum watch_notification_type {
> +	WATCH_TYPE_META		= 0,	/* Special record */
> +	WATCH_TYPE_MOUNT_NOTIFY	= 1,	/* Mount notification record */
> +	WATCH_TYPE_SB_NOTIFY	= 2,	/* Superblock notification */
> +	WATCH_TYPE_KEY_NOTIFY	= 3,	/* Key/keyring change notification */
> +	WATCH_TYPE_BLOCK_NOTIFY	= 4,	/* Block layer notifications */
> +#define WATCH_TYPE___NR 5
> +};
> +
> +enum watch_meta_notification_subtype {
> +	WATCH_META_SKIP_NOTIFICATION	= 0,	/* Just skip this record */
> +	WATCH_META_REMOVAL_NOTIFICATION	= 1,	/* Watched object was removed */
> +};
> +
> +/*
> + * Notification record
> + */
> +struct watch_notification {
> +	__u32			type:24;	/* enum watch_notification_type */
> +	__u32			subtype:8;	/* Type-specific subtype (filterable) */
> +	__u32			info;
> +#define WATCH_INFO_OVERRUN	0x00000001	/* Event(s) lost due to overrun */
> +#define WATCH_INFO_ENOMEM	0x00000002	/* Event(s) lost due to ENOMEM */
> +#define WATCH_INFO_RECURSIVE	0x00000004	/* Change was recursive */
> +#define WATCH_INFO_LENGTH	0x000001f8	/* Length of record / sizeof(watch_notification) */
> +#define WATCH_INFO_IN_SUBTREE	0x00000200	/* Change was not at watched root */
> +#define WATCH_INFO_TYPE_FLAGS	0x00ff0000	/* Type-specific flags */
> +#define WATCH_INFO_FLAG_0	0x00010000
> +#define WATCH_INFO_FLAG_1	0x00020000
> +#define WATCH_INFO_FLAG_2	0x00040000
> +#define WATCH_INFO_FLAG_3	0x00080000
> +#define WATCH_INFO_FLAG_4	0x00100000
> +#define WATCH_INFO_FLAG_5	0x00200000
> +#define WATCH_INFO_FLAG_6	0x00400000
> +#define WATCH_INFO_FLAG_7	0x00800000
> +#define WATCH_INFO_ID		0xff000000	/* ID of watchpoint */
> +};
> +
> +#define WATCH_LENGTH_SHIFT	3
> +
> +struct watch_queue_buffer {
> +	union {
> +		/* The first few entries are special, containing the
> +		 * ring management variables.
> +		 */
> +		struct {
> +			struct watch_notification watch; /* WATCH_TYPE_SKIP */
> +			volatile __u32	head;		/* Ring head index */
> +			volatile __u32	tail;		/* Ring tail index */

A uapi structure that has volatile in it?  Are you _SURE_ this is
correct?

That feels wrong to me...  This is not a backing-hardware register, it's
"just memory" and slapping volatile on it shouldn't be the correct
solution for telling the compiler to not to optimize away reads/flushes,
right?  You need a proper memory access type primitive for that to work
correctly everywhere I thought.

We only have 2 users of volatile in include/uapi, one for WMI structures
that are backed by firmware (seems correct), and one for DRM which I
have no idea how it works as it claims to be a lock.  Why is this new
addition the correct way to do this that no other ring-buffer that was
mmapped has needed to?

thanks,

greg k-h



More information about the Linux-security-module-archive mailing list