[PATCH v3 50/57] perf: Simplify sys_perf_event_open()

Mon Jun 12 09:08:03 UTC 2023

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/file.h |    3 
 kernel/events/core.c |  483 +++++++++++++++++++++++----------------------------
 2 files changed, 222 insertions(+), 264 deletions(-)

--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -84,6 +84,7 @@ static inline void fdput_pos(struct fd f
 }
 
 DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
+DEFINE_FREE(fdput, struct fd, fdput(_T))
 
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
@@ -96,6 +97,8 @@ extern void put_unused_fd(unsigned int f
 DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
 	     get_unused_fd_flags(flags), unsigned flags)
 
+#define no_free_fd(fd) ({ int __fd = (fd); (fd) = -1; __fd; })
+
 extern void fd_install(unsigned int fd, struct file *file);
 
 extern int __receive_fd(struct file *file, int __user *ufd,
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1163,9 +1163,10 @@ static void perf_assert_pmu_disabled(str
 	WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
 }
 
-static void get_ctx(struct perf_event_context *ctx)
+static struct perf_event_context *get_ctx(struct perf_event_context *ctx)
 {
 	refcount_inc(&ctx->refcount);
+	return ctx;
 }
 
 static void *alloc_task_ctx_data(struct pmu *pmu)
@@ -4672,9 +4673,6 @@ find_lively_task_by_vpid(pid_t vpid)
 		get_task_struct(task);
 	rcu_read_unlock();
 
-	if (!task)
-		return ERR_PTR(-ESRCH);
-
 	return task;
 }
 
@@ -4754,6 +4752,11 @@ find_get_context(struct task_struct *tas
 	return ERR_PTR(err);
 }
 
+DEFINE_CLASS(find_get_ctx, struct perf_event_context *,
+	     if (!IS_ERR_OR_NULL(_T)) { perf_unpin_context(_T); put_ctx(_T); },
+	     find_get_context(task, event),
+	     struct task_struct *task, struct perf_event *event)
+
 /*
  * Returns a matching perf_event_pmu_context with elevated refcount or NULL.
  */
@@ -4836,9 +4839,10 @@ find_get_pmu_context(struct pmu *pmu, st
 	return epc;
 }
 
-static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+static struct perf_event_pmu_context *get_pmu_ctx(struct perf_event_pmu_context *epc)
 {
 	WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+	return epc;
 }
 
 static void free_epc_rcu(struct rcu_head *head)
@@ -4881,6 +4885,8 @@ static void put_pmu_ctx(struct perf_even
 	call_rcu(&epc->rcu_head, free_epc_rcu);
 }
 
+DEFINE_FREE(put_pmu_ctx, struct perf_event_pmu_context *, if (!IS_ERR_OR_NULL(_T)) put_pmu_ctx(_T)) /* skip NULL and ERR_PTR */
+
 static void perf_event_free_filter(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
@@ -5190,6 +5196,8 @@ static void free_event(struct perf_event
 	_free_event(event);
 }
 
+DEFINE_FREE(free_event, struct perf_event *, if (!IS_ERR_OR_NULL(_T)) free_event(_T))
+
 /*
  * Remove user event from the owner task.
  */
@@ -5748,19 +5756,6 @@ EXPORT_SYMBOL_GPL(perf_event_period);
 
 static const struct file_operations perf_fops;
 
-static inline struct fd perf_fdget(int fd)
-{
-	struct fd f = fdget(fd);
-	if (!f.file)
-		return fdnull;
-
-	if (f.file->f_op != &perf_fops) {
-		fdput(f);
-		return fdnull;
-	}
-	return f;
-}
-
 static inline bool is_perf_fd(struct fd fd)
 {
 	return fd.file && fd.file->f_op == &perf_fops;
@@ -12189,19 +12184,16 @@ SYSCALL_DEFINE5(perf_event_open,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_event *group_leader = NULL, *output_event = NULL;
-	struct perf_event_pmu_context *pmu_ctx;
-	struct perf_event *event, *sibling;
+	struct perf_event *sibling;
 	struct perf_event_attr attr;
-	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
-	struct fd group = {NULL, 0};
-	struct task_struct *task = NULL;
+	struct task_struct *task __free(put_task) = NULL;
+	struct fd group __free(fdput) = fdnull;
 	struct pmu *pmu;
-	int event_fd;
 	int move_group = 0;
-	int err;
 	int f_flags = O_RDWR;
 	int cgroup_fd = -1;
+	int err;
 
 	/* for future expandability... */
 	if (flags & ~PERF_FLAG_ALL)
@@ -12261,16 +12253,14 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (flags & PERF_FLAG_FD_CLOEXEC)
 		f_flags |= O_CLOEXEC;
 
-	event_fd = get_unused_fd_flags(f_flags);
-	if (event_fd < 0)
-		return event_fd;
+	CLASS(get_unused_fd, fd)(f_flags);
+	if (fd < 0)
+		return fd;
 
 	if (group_fd != -1) {
-		group = perf_fdget(group_fd);
-		if (!group.file) {
-			err = -EBADF;
-			goto err_fd;
-		}
+		group = fdget(group_fd);
+		if (!is_perf_fd(group))
+			return -EBADF;
 		group_leader = group.file->private_data;
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
@@ -12280,33 +12270,26 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 		task = find_lively_task_by_vpid(pid);
-		if (IS_ERR(task)) {
-			err = PTR_ERR(task);
-			goto err_group_fd;
-		}
+		if (!task)
+			return -ESRCH;
 	}
 
 	if (task && group_leader &&
-	    group_leader->attr.inherit != attr.inherit) {
-		err = -EINVAL;
-		goto err_task;
-	}
+	    group_leader->attr.inherit != attr.inherit)
+		return -EINVAL;
 
 	if (flags & PERF_FLAG_PID_CGROUP)
 		cgroup_fd = pid;
 
-	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
+	struct perf_event *event __free(free_event) =
+		perf_event_alloc(&attr, cpu, task, group_leader, NULL,
 				 NULL, NULL, cgroup_fd);
-	if (IS_ERR(event)) {
-		err = PTR_ERR(event);
-		goto err_task;
-	}
+	if (IS_ERR(event))
+		return PTR_ERR(event);
 
 	if (is_sampling_event(event)) {
-		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
-			err = -EOPNOTSUPP;
-			goto err_alloc;
-		}
+		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+			return -EOPNOTSUPP;
 	}
 
 	/*
@@ -12318,266 +12301,238 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (attr.use_clockid) {
 		err = perf_event_set_clock(event, attr.clockid);
 		if (err)
-			goto err_alloc;
+			return err;
 	}
 
 	if (pmu->task_ctx_nr == perf_sw_context)
 		event->event_caps |= PERF_EV_CAP_SOFTWARE;
 
-	if (task) {
-		err = down_read_interruptible(&task->signal->exec_update_lock);
-		if (err)
-			goto err_alloc;
+	do {
+		struct rw_semaphore *exec_update_lock __free(up_read) = NULL;
+		if (task) {
+			err = down_read_interruptible(&task->signal->exec_update_lock);
+			if (err)
+				return err;
+
+			exec_update_lock = &task->signal->exec_update_lock;
+
+			/*
+			 * We must hold exec_update_lock across this and any potential
+			 * perf_install_in_context() call for this new event to
+			 * serialize against exec() altering our credentials (and the
+			 * perf_event_exit_task() that could imply).
+			 */
+			if (!perf_check_permission(&attr, task))
+				return -EACCES;
+		}
 
 		/*
-		 * We must hold exec_update_lock across this and any potential
-		 * perf_install_in_context() call for this new event to
-		 * serialize against exec() altering our credentials (and the
-		 * perf_event_exit_task() that could imply).
+		 * Get the target context (task or percpu):
 		 */
-		err = -EACCES;
-		if (!perf_check_permission(&attr, task))
-			goto err_cred;
-	}
+		CLASS(find_get_ctx, ctx)(task, event);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
 
-	/*
-	 * Get the target context (task or percpu):
-	 */
-	ctx = find_get_context(task, event);
-	if (IS_ERR(ctx)) {
-		err = PTR_ERR(ctx);
-		goto err_cred;
-	}
-
-	mutex_lock(&ctx->mutex);
+		guard(mutex)(&ctx->mutex);
 
-	if (ctx->task == TASK_TOMBSTONE) {
-		err = -ESRCH;
-		goto err_locked;
-	}
+		if (ctx->task == TASK_TOMBSTONE)
+			return -ESRCH;
 
-	if (!task) {
-		/*
-		 * Check if the @cpu we're creating an event for is online.
-		 *
-		 * We use the perf_cpu_context::ctx::mutex to serialize against
-		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
-		 */
-		struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+		if (!task) {
+			/*
+			 * Check if the @cpu we're creating an event for is
+			 * online.
+			 *
+			 * We use the perf_cpu_context::ctx::mutex to serialize
+			 * against the hotplug notifiers. See
+			 * perf_event_{init,exit}_cpu().
+			 */
+			struct perf_cpu_context *cpuctx =
+				per_cpu_ptr(&perf_cpu_context, event->cpu);
 
-		if (!cpuctx->online) {
-			err = -ENODEV;
-			goto err_locked;
+			if (!cpuctx->online)
+				return -ENODEV;
 		}
-	}
 
-	if (group_leader) {
-		err = -EINVAL;
+		if (group_leader) {
+			err = -EINVAL;
 
-		/*
-		 * Do not allow a recursive hierarchy (this new sibling
-		 * becoming part of another group-sibling):
-		 */
-		if (group_leader->group_leader != group_leader)
-			goto err_locked;
-
-		/* All events in a group should have the same clock */
-		if (group_leader->clock != event->clock)
-			goto err_locked;
+			/*
+			 * Do not allow a recursive hierarchy (this new sibling
+			 * becoming part of another group-sibling)
+			 */
+			if (group_leader->group_leader != group_leader)
+				return -EINVAL;
 
-		/*
-		 * Make sure we're both events for the same CPU;
-		 * grouping events for different CPUs is broken; since
-		 * you can never concurrently schedule them anyhow.
-		 */
-		if (group_leader->cpu != event->cpu)
-			goto err_locked;
+			/* All events in a group should have the same clock */
+			if (group_leader->clock != event->clock)
+				return -EINVAL;
 
-		/*
-		 * Make sure we're both on the same context; either task or cpu.
-		 */
-		if (group_leader->ctx != ctx)
-			goto err_locked;
+			/*
+			 * Make sure we're both events for the same CPU;
+			 * grouping events for different CPUs is broken; since
+			 * you can never concurrently schedule them anyhow.
+			 */
+			if (group_leader->cpu != event->cpu)
+				return -EINVAL;
 
-		/*
-		 * Only a group leader can be exclusive or pinned
-		 */
-		if (attr.exclusive || attr.pinned)
-			goto err_locked;
+			/*
+			 * Make sure we're both on the same context; either
+			 * task or cpu.
+			 */
+			if (group_leader->ctx != ctx)
+				return -EINVAL;
 
-		if (is_software_event(event) &&
-		    !in_software_context(group_leader)) {
 			/*
-			 * If the event is a sw event, but the group_leader
-			 * is on hw context.
-			 *
-			 * Allow the addition of software events to hw
-			 * groups, this is safe because software events
-			 * never fail to schedule.
-			 *
-			 * Note the comment that goes with struct
-			 * perf_event_pmu_context.
+			 * Only a group leader can be exclusive or pinned
 			 */
-			pmu = group_leader->pmu_ctx->pmu;
-		} else if (!is_software_event(event)) {
-			if (is_software_event(group_leader) &&
-			    (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+			if (attr.exclusive || attr.pinned)
+				return -EINVAL;
+
+			if (is_software_event(event) &&
+			    !in_software_context(group_leader)) {
+				/*
+				 * If the event is a sw event, but the
+				 * group_leader is on hw context.
+				 *
+				 * Allow the addition of software events to hw
+				 * groups, this is safe because software events
+				 * never fail to schedule.
+				 *
+				 * Note the comment that goes with struct
+				 * perf_event_pmu_context.
+				 */
+				pmu = group_leader->pmu_ctx->pmu;
+			} else if (!is_software_event(event)) {
+				if (is_software_event(group_leader) &&
+				    (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+					/*
+					 * In case the group is a pure software
+					 * group, and we try to add a hardware
+					 * event, move the whole group to the
+					 * hardware context.
+					 */
+					move_group = 1;
+				}
+
 				/*
-				 * In case the group is a pure software group, and we
-				 * try to add a hardware event, move the whole group to
-				 * the hardware context.
+				 * Don't allow group of multiple hw events from
+				 * different pmus
 				 */
-				move_group = 1;
+				if (!in_software_context(group_leader) &&
+				    group_leader->pmu_ctx->pmu != pmu)
+					return -EINVAL;
 			}
+		}
+
+		/*
+		 * Now that we're certain of the pmu; find the pmu_ctx.
+		 */
+		struct perf_event_pmu_context *pmu_ctx __free(put_pmu_ctx) =
+			find_get_pmu_context(pmu, ctx, event);
+		if (!pmu_ctx)
+			return -ENOMEM;
 
-			/* Don't allow group of multiple hw events from different pmus */
-			if (!in_software_context(group_leader) &&
-			    group_leader->pmu_ctx->pmu != pmu)
-				goto err_locked;
+		if (output_event) {
+			err = perf_event_set_output(event, output_event);
+			if (err)
+				return err;
 		}
-	}
 
-	/*
-	 * Now that we're certain of the pmu; find the pmu_ctx.
-	 */
-	pmu_ctx = find_get_pmu_context(pmu, ctx, event);
-	if (IS_ERR(pmu_ctx)) {
-		err = PTR_ERR(pmu_ctx);
-		goto err_locked;
-	}
-	event->pmu_ctx = pmu_ctx;
+		if (!perf_event_validate_size(event))
+			return -E2BIG;
 
-	if (output_event) {
-		err = perf_event_set_output(event, output_event);
-		if (err)
-			goto err_context;
-	}
+		if (perf_need_aux_event(event) &&
+		    !perf_get_aux_event(event, group_leader))
+			return -EINVAL;
 
-	if (!perf_event_validate_size(event)) {
-		err = -E2BIG;
-		goto err_context;
-	}
+		/*
+		 * Must be under the same ctx::mutex as perf_install_in_context(),
+		 * because we need to serialize with concurrent event creation.
+		 */
+		if (!exclusive_event_installable(event, ctx))
+			return -EBUSY;
 
-	if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
-		err = -EINVAL;
-		goto err_context;
-	}
+		WARN_ON_ONCE(ctx->parent_ctx);
 
-	/*
-	 * Must be under the same ctx::mutex as perf_install_in_context(),
-	 * because we need to serialize with concurrent event creation.
-	 */
-	if (!exclusive_event_installable(event, ctx)) {
-		err = -EBUSY;
-		goto err_context;
-	}
+		event_file = anon_inode_getfile("[perf_event]", &perf_fops,
+						event, f_flags);
+		if (IS_ERR(event_file))
+			return PTR_ERR(event_file);
 
-	WARN_ON_ONCE(ctx->parent_ctx);
+		/*
+		 * The event is now owned by event_file and will be cleaned up
+		 * through perf_fops::release(). Similarly the fd will be linked
+		 * to event_file and should not be put_unused_fd().
+		 */
 
-	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
-	if (IS_ERR(event_file)) {
-		err = PTR_ERR(event_file);
-		event_file = NULL;
-		goto err_context;
-	}
+		/*
+		 * This is the point of no return; we cannot fail hereafter. This is
+		 * where we start modifying current state.
+		 */
 
-	/*
-	 * This is the point on no return; we cannot fail hereafter. This is
-	 * where we start modifying current state.
-	 */
+		if (move_group) {
+			/*
+			 * Moves the events from one pmu to another, hence we need
+			 * to update the pmu_ctx, but through all this the ctx
+			 * stays the same.
+			 */
+			perf_remove_from_context(group_leader, 0);
+			put_pmu_ctx(group_leader->pmu_ctx);
 
-	if (move_group) {
-		perf_remove_from_context(group_leader, 0);
-		put_pmu_ctx(group_leader->pmu_ctx);
+			for_each_sibling_event(sibling, group_leader) {
+				perf_remove_from_context(sibling, 0);
+				put_pmu_ctx(sibling->pmu_ctx);
+			}
 
-		for_each_sibling_event(sibling, group_leader) {
-			perf_remove_from_context(sibling, 0);
-			put_pmu_ctx(sibling->pmu_ctx);
-		}
+			/*
+			 * Install the group siblings before the group leader.
+			 *
+			 * Because a group leader will try and install the entire group
+			 * (through the sibling list, which is still intact), we can
+			 * end up with siblings installed in the wrong context.
+			 *
+			 * By installing siblings first we NO-OP because they're not
+			 * reachable through the group lists.
+			 */
+			for_each_sibling_event(sibling, group_leader) {
+				sibling->pmu_ctx = get_pmu_ctx(pmu_ctx);
+				perf_event__state_init(sibling);
+				perf_install_in_context(ctx, sibling, sibling->cpu);
+			}
 
-		/*
-		 * Install the group siblings before the group leader.
-		 *
-		 * Because a group leader will try and install the entire group
-		 * (through the sibling list, which is still in-tact), we can
-		 * end up with siblings installed in the wrong context.
-		 *
-		 * By installing siblings first we NO-OP because they're not
-		 * reachable through the group lists.
-		 */
-		for_each_sibling_event(sibling, group_leader) {
-			sibling->pmu_ctx = pmu_ctx;
-			get_pmu_ctx(pmu_ctx);
-			perf_event__state_init(sibling);
-			perf_install_in_context(ctx, sibling, sibling->cpu);
+			/*
+			 * Removing from the context leaves the event
+			 * disabled. What we want here is the event in its initial
+			 * startup state, ready to be added to the new context.
+			 */
+			group_leader->pmu_ctx = get_pmu_ctx(pmu_ctx);
+			perf_event__state_init(group_leader);
+			perf_install_in_context(ctx, group_leader, group_leader->cpu);
 		}
 
 		/*
-		 * Removing from the context ends up with disabled
-		 * event. What we want here is event in the initial
-		 * startup state, ready to be add into new context.
+		 * Precalculate sample_data sizes; do while holding ctx::mutex such
+		 * that we're serialized against further additions and before
+		 * perf_install_in_context() which is the point the event is active and
+		 * can use these values.
 		 */
-		group_leader->pmu_ctx = pmu_ctx;
-		get_pmu_ctx(pmu_ctx);
-		perf_event__state_init(group_leader);
-		perf_install_in_context(ctx, group_leader, group_leader->cpu);
-	}
+		perf_event__header_size(event);
+		perf_event__id_header_size(event);
 
-	/*
-	 * Precalculate sample_data sizes; do while holding ctx::mutex such
-	 * that we're serialized against further additions and before
-	 * perf_install_in_context() which is the point the event is active and
-	 * can use these values.
-	 */
-	perf_event__header_size(event);
-	perf_event__id_header_size(event);
+		event->owner = current;
 
-	event->owner = current;
+		event->pmu_ctx = no_free_ptr(pmu_ctx);
+		perf_install_in_context(get_ctx(ctx), event, event->cpu);
+	} while (0);
 
-	perf_install_in_context(ctx, event, event->cpu);
-	perf_unpin_context(ctx);
+	scoped_guard (mutex, &current->perf_event_mutex)
+		list_add_tail(&event->owner_entry, &current->perf_event_list);
 
-	mutex_unlock(&ctx->mutex);
+	fd_install(fd, event_file);
 
-	if (task) {
-		up_read(&task->signal->exec_update_lock);
-		put_task_struct(task);
-	}
-
-	mutex_lock(&current->perf_event_mutex);
-	list_add_tail(&event->owner_entry, &current->perf_event_list);
-	mutex_unlock(&current->perf_event_mutex);
-
-	/*
-	 * Drop the reference on the group_event after placing the
-	 * new event on the sibling_list. This ensures destruction
-	 * of the group leader will find the pointer to itself in
-	 * perf_group_detach().
-	 */
-	fdput(group);
-	fd_install(event_fd, event_file);
-	return event_fd;
-
-err_context:
-	put_pmu_ctx(event->pmu_ctx);
-	event->pmu_ctx = NULL; /* _free_event() */
-err_locked:
-	mutex_unlock(&ctx->mutex);
-	perf_unpin_context(ctx);
-	put_ctx(ctx);
-err_cred:
-	if (task)
-		up_read(&task->signal->exec_update_lock);
-err_alloc:
-	free_event(event);
-err_task:
-	if (task)
-		put_task_struct(task);
-err_group_fd:
-	fdput(group);
-err_fd:
-	put_unused_fd(event_fd);
-	return err;
+	no_free_ptr(event);
+	return no_free_fd(fd);
 }
 
 /**