[PATCH v3 1/3] landlock: Multithreading support for landlock_restrict_self()
Günther Noack
gnoack at google.com
Thu Nov 27 11:51:34 UTC 2025
Introduce the LANDLOCK_RESTRICT_SELF_TSYNC flag. With this flag, a
given Landlock ruleset is applied to all threads of the calling
process, instead of only the current one.
Without this flag, multithreaded userspace programs currently resort
to the nptl(7)/libpsx hack to enforce a policy across all threads,
a scheme which is also used by libcap and for setuid(2). With this
userspace-based scheme, the threads of a process enforce the same
Landlock policy, but the resulting Landlock domains are still
separate. These separate domains cause multiple problems:
* When using Landlock's "scoped" access rights, the domain identity is
used to determine whether an operation is permitted. As a result,
when using LANDLOCK_SCOPE_SIGNAL, signaling between sibling threads
stops working. This is a problem for programming languages and
frameworks which are inherently multithreaded (e.g. Go).
* In audit logging, the domains of separate threads in a process will
get logged with different domain IDs, even when they are based on
the same ruleset FD, which might confuse users.
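As an illustration, a multithreaded program would use the new flag
roughly as follows. This is a minimal sketch going through syscall(2)
(libc provides no dedicated wrappers); the helper name
enforce_on_all_threads() and the handled access right are arbitrary
placeholders for the example:

    #define _GNU_SOURCE
    #include <linux/landlock.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Enforce a ruleset on every thread of the calling process. */
    static int enforce_on_all_threads(void)
    {
            const struct landlock_ruleset_attr attr = {
                    .handled_access_fs = LANDLOCK_ACCESS_FS_WRITE_FILE,
            };
            const int ruleset_fd = syscall(__NR_landlock_create_ruleset,
                                           &attr, sizeof(attr), 0);
            int err;

            if (ruleset_fd < 0)
                    return -1;

            /* Needed when running without CAP_SYS_ADMIN. */
            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
                    close(ruleset_fd);
                    return -1;
            }

            /*
             * With LANDLOCK_RESTRICT_SELF_TSYNC, the resulting domain
             * is applied to all threads of the process, not only the
             * calling one.
             */
            err = syscall(__NR_landlock_restrict_self, ruleset_fd,
                          LANDLOCK_RESTRICT_SELF_TSYNC);
            close(ruleset_fd);
            return err;
    }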
Cc: Andrew G. Morgan <morgan at kernel.org>
Cc: John Johansen <john.johansen at canonical.com>
Cc: Mickaël Salaün <mic at digikod.net>
Cc: Paul Moore <paul at paul-moore.com>
Cc: linux-security-module at vger.kernel.org
Suggested-by: Jann Horn <jannh at google.com>
Signed-off-by: Günther Noack <gnoack at google.com>
---
include/uapi/linux/landlock.h | 13 +
security/landlock/Makefile | 2 +-
security/landlock/cred.h | 12 +
security/landlock/limits.h | 2 +-
security/landlock/syscalls.c | 66 ++-
security/landlock/tsync.c | 555 +++++++++++++++++++
security/landlock/tsync.h | 16 +
tools/testing/selftests/landlock/base_test.c | 2 +-
8 files changed, 638 insertions(+), 30 deletions(-)
create mode 100644 security/landlock/tsync.c
create mode 100644 security/landlock/tsync.h
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index f030adc462ee..a60793767c4b 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -117,11 +117,24 @@ struct landlock_ruleset_attr {
* future nested domains, not the one being created. It can also be used
* with a @ruleset_fd value of -1 to mute subdomain logs without creating a
* domain.
+ *
+ * The following flag supports policy enforcement in multithreaded processes:
+ *
+ * %LANDLOCK_RESTRICT_SELF_TSYNC
+ * Applies the new Landlock configuration atomically to all threads of the
+ * current process, including the Landlock domain and logging
+ * configuration. This overrides the Landlock configuration of sibling
+ * threads, irrespective of previously established Landlock domains and
+ * logging configurations on these threads.
+ *
+ * If the calling thread is running with no_new_privs, this operation
+ * enables no_new_privs on the sibling threads as well.
*/
/* clang-format off */
#define LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF (1U << 0)
#define LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON (1U << 1)
#define LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF (1U << 2)
+#define LANDLOCK_RESTRICT_SELF_TSYNC (1U << 3)
/* clang-format on */
/**
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 3160c2bdac1d..74122f814cf6 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
-landlock-y := setup.o syscalls.o object.o ruleset.o \
+landlock-y := setup.o syscalls.o tsync.o object.o ruleset.o \
cred.o task.o fs.o
landlock-$(CONFIG_INET) += net.o
diff --git a/security/landlock/cred.h b/security/landlock/cred.h
index c82fe63ec598..c10a06727eb1 100644
--- a/security/landlock/cred.h
+++ b/security/landlock/cred.h
@@ -26,6 +26,8 @@
* This structure is packed to minimize the size of struct
* landlock_file_security. However, it is always aligned in the LSM cred blob,
* see lsm_set_blob_size().
+ *
+ * When updating this, also update landlock_cred_copy() if needed.
*/
struct landlock_cred_security {
/**
@@ -65,6 +67,16 @@ landlock_cred(const struct cred *cred)
return cred->security + landlock_blob_sizes.lbs_cred;
}
+static inline void landlock_cred_copy(struct landlock_cred_security *dst,
+ const struct landlock_cred_security *src)
+{
+ landlock_put_ruleset(dst->domain);
+
+ *dst = *src;
+
+ landlock_get_ruleset(src->domain);
+}
+
static inline struct landlock_ruleset *landlock_get_current_domain(void)
{
return landlock_cred(current_cred())->domain;
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index 65b5ff051674..eb584f47288d 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -31,7 +31,7 @@
#define LANDLOCK_MASK_SCOPE ((LANDLOCK_LAST_SCOPE << 1) - 1)
#define LANDLOCK_NUM_SCOPE __const_hweight64(LANDLOCK_MASK_SCOPE)
-#define LANDLOCK_LAST_RESTRICT_SELF LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF
+#define LANDLOCK_LAST_RESTRICT_SELF LANDLOCK_RESTRICT_SELF_TSYNC
#define LANDLOCK_MASK_RESTRICT_SELF ((LANDLOCK_LAST_RESTRICT_SELF << 1) - 1)
/* clang-format on */
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 0116e9f93ffe..22b6200283f3 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -36,6 +36,7 @@
#include "net.h"
#include "ruleset.h"
#include "setup.h"
+#include "tsync.h"
static bool is_initialized(void)
{
@@ -161,7 +162,7 @@ static const struct file_operations ruleset_fops = {
* Documentation/userspace-api/landlock.rst should be updated to reflect the
* UAPI change.
*/
-const int landlock_abi_version = 7;
+const int landlock_abi_version = 8;
/**
* sys_landlock_create_ruleset - Create a new ruleset
@@ -454,9 +455,10 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
* - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF
* - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON
* - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF
+ * - %LANDLOCK_RESTRICT_SELF_TSYNC
*
- * This system call enables to enforce a Landlock ruleset on the current
- * thread. Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
+ * This system call enforces a Landlock ruleset on the current thread.
+ * Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
* namespace or is running with no_new_privs. This avoids scenarios where
* unprivileged tasks can affect the behavior of privileged children.
*
@@ -484,6 +486,7 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
struct landlock_cred_security *new_llcred;
bool __maybe_unused log_same_exec, log_new_exec, log_subdomains,
prev_log_subdomains;
+ int err;
if (!is_initialized())
return -EOPNOTSUPP;
@@ -538,33 +541,42 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
* We could optimize this case by not calling commit_creds() if this flag
* was already set, but it is not worth the complexity.
*/
- if (!ruleset)
- return commit_creds(new_cred);
+ if (ruleset) {
+ /*
+ * There is no possible race condition while copying and
+ * manipulating the current credentials because they are
+ * dedicated per thread.
+ */
+ new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
+ if (IS_ERR(new_dom)) {
+ abort_creds(new_cred);
+ return PTR_ERR(new_dom);
+ }
- /*
- * There is no possible race condition while copying and manipulating
- * the current credentials because they are dedicated per thread.
- */
- new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
- if (IS_ERR(new_dom)) {
- abort_creds(new_cred);
- return PTR_ERR(new_dom);
+#ifdef CONFIG_AUDIT
+ new_dom->hierarchy->log_same_exec = log_same_exec;
+ new_dom->hierarchy->log_new_exec = log_new_exec;
+ if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains)
+ new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED;
+#endif /* CONFIG_AUDIT */
+
+ /* Replaces the old (prepared) domain. */
+ landlock_put_ruleset(new_llcred->domain);
+ new_llcred->domain = new_dom;
+
+#ifdef CONFIG_AUDIT
+ new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
+#endif /* CONFIG_AUDIT */
}
-#ifdef CONFIG_AUDIT
- new_dom->hierarchy->log_same_exec = log_same_exec;
- new_dom->hierarchy->log_new_exec = log_new_exec;
- if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains)
- new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED;
-#endif /* CONFIG_AUDIT */
-
- /* Replaces the old (prepared) domain. */
- landlock_put_ruleset(new_llcred->domain);
- new_llcred->domain = new_dom;
-
-#ifdef CONFIG_AUDIT
- new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
-#endif /* CONFIG_AUDIT */
+ if (flags & LANDLOCK_RESTRICT_SELF_TSYNC) {
+ err = landlock_restrict_sibling_threads(current_cred(),
+ new_cred);
+ if (err != 0) {
+ abort_creds(new_cred);
+ return err;
+ }
+ }
return commit_creds(new_cred);
}
diff --git a/security/landlock/tsync.c b/security/landlock/tsync.c
new file mode 100644
index 000000000000..a8db75259a8c
--- /dev/null
+++ b/security/landlock/tsync.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - Cross-thread ruleset enforcement
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/task_work.h>
+
+#include "cred.h"
+#include "tsync.h"
+
+/*
+ * Shared state between multiple threads which are enforcing Landlock rulesets
+ * in lockstep with each other.
+ */
+struct tsync_shared_context {
+ /* The old and tentative new creds of the calling thread. */
+ const struct cred *old_cred;
+ const struct cred *new_cred;
+
+ /* True if sibling tasks need to set the no_new_privs flag. */
+ bool set_no_new_privs;
+
+ /* An error encountered in preparation step, or 0. */
+ atomic_t preparation_error;
+
+ /*
+ * Barrier after preparation step in restrict_one_thread.
+ * The calling thread waits for completion.
+ *
+ * Re-initialized on every round of looking for newly spawned threads.
+ */
+ atomic_t num_preparing;
+ struct completion all_prepared;
+
+ /* Sibling threads wait for completion. */
+ struct completion ready_to_commit;
+
+ /*
+ * Barrier after commit step (used by syscall impl to wait for
+ * completion).
+ */
+ atomic_t num_unfinished;
+ struct completion all_finished;
+};
+
+struct tsync_work {
+ struct callback_head work;
+ struct task_struct *task;
+ struct tsync_shared_context *shared_ctx;
+};
+
+/*
+ * restrict_one_thread - update a thread's Landlock domain in lockstep with the
+ * other threads in the same process
+ *
+ * When this runs, the same function also runs in all other threads of the
+ * same process, except for the one which called landlock_restrict_self().
+ * The concurrently running invocations of restrict_one_thread coordinate
+ * through the shared ctx object to do their work in lockstep to implement
+ * all-or-nothing semantics for enforcing the new Landlock domain.
+ *
+ * Afterwards, depending on the presence of an error, all threads either commit
+ * or abort the prepared credentials. The commit operation cannot fail anymore.
+ */
+static void restrict_one_thread(struct tsync_shared_context *ctx)
+{
+ int err;
+ struct cred *cred = NULL;
+
+ if (current_cred() == ctx->old_cred) {
+ /*
+ * Switch out old_cred with new_cred, if possible.
+ *
+ * In the common case, where all threads initially point to the
+ * same struct cred, this optimization avoids creating separate
+ * redundant credentials objects for each, which would all have
+ * the same contents.
+ *
+ * Note: We are intentionally dropping the const qualifier here,
+ * because it is required by commit_creds() and abort_creds().
+ */
+ cred = (struct cred *)get_cred(ctx->new_cred);
+ } else {
+ /* Else, prepare new creds and populate them. */
+ cred = prepare_creds();
+
+ if (!cred) {
+ atomic_set(&ctx->preparation_error, -ENOMEM);
+
+ /*
+ * Even on error, we need to adhere to the protocol and
+ * coordinate with concurrently running invocations.
+ */
+ if (atomic_dec_return(&ctx->num_preparing) == 0)
+ complete_all(&ctx->all_prepared);
+
+ goto out;
+ }
+
+ landlock_cred_copy(landlock_cred(cred),
+ landlock_cred(ctx->new_cred));
+ }
+
+ /*
+ * Barrier: Wait until all threads are done preparing.
+ * After this point, we can have no more failures.
+ */
+ if (atomic_dec_return(&ctx->num_preparing) == 0)
+ complete_all(&ctx->all_prepared);
+
+ /*
+ * Wait for signal from calling thread that it's safe to read the
+ * preparation error now and we are ready to commit (or abort).
+ */
+ wait_for_completion(&ctx->ready_to_commit);
+
+ /* Abort the commit if any of the other threads had an error. */
+ err = atomic_read(&ctx->preparation_error);
+ if (err) {
+ abort_creds(cred);
+ goto out;
+ }
+
+ /*
+ * Make sure that all sibling tasks fulfill the no_new_privs
+ * prerequisite. (This is in line with Seccomp's
+ * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c.)
+ */
+ if (ctx->set_no_new_privs)
+ task_set_no_new_privs(current);
+
+ commit_creds(cred);
+
+out:
+ /* Notify the calling thread once all threads are done */
+ if (atomic_dec_return(&ctx->num_unfinished) == 0)
+ complete_all(&ctx->all_finished);
+}
+
+/*
+ * restrict_one_thread_callback - task_work callback for restricting a thread
+ *
+ * Calls restrict_one_thread() with the shared struct tsync_shared_context.
+ */
+static void restrict_one_thread_callback(struct callback_head *work)
+{
+ struct tsync_work *ctx = container_of(work, struct tsync_work, work);
+
+ restrict_one_thread(ctx->shared_ctx);
+}
+
+/*
+ * struct tsync_works - a growable array of per-task contexts
+ *
+ * The zero-initialized struct represents the empty array.
+ */
+struct tsync_works {
+ struct tsync_work **works;
+ size_t size;
+ size_t capacity;
+};
+
+/*
+ * tsync_works_provide - provides a preallocated tsync_work for the given task
+ *
+ * This also stores a task pointer in the context and increments the reference
+ * count of the task.
+ *
+ * This function may fail in the case where we did not preallocate sufficient
+ * capacity. This can legitimately happen if new threads get started after we
+ * grew the capacity.
+ *
+ * Returns:
+ * A pointer to the preallocated context struct, with task filled in.
+ *
+ * NULL, if we ran out of preallocated context structs.
+ */
+static struct tsync_work *tsync_works_provide(struct tsync_works *s,
+ struct task_struct *task)
+{
+ struct tsync_work *ctx;
+
+ if (s->size >= s->capacity)
+ return NULL;
+
+ ctx = s->works[s->size];
+ s->size++;
+
+ ctx->task = get_task_struct(task);
+ return ctx;
+}
+
+/*
+ * tsync_works_grow_by - preallocates space for n more contexts in s
+ *
+ * On a successful return, the subsequent n calls to tsync_works_provide() are
+ * guaranteed to succeed. (size + n <= capacity)
+ *
+ * Returns:
+ * -EOVERFLOW if size + n overflows
+ * -ENOMEM if a (re)allocation fails; s stays consistent, but with less
+ * capacity than requested
+ * 0 if the allocation succeeds or no reallocation was needed
+ */
+static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
+{
+ size_t i;
+ size_t new_capacity;
+ struct tsync_work **works;
+ struct tsync_work *work;
+
+ if (check_add_overflow(s->size, n, &new_capacity))
+ return -EOVERFLOW;
+
+ /* No need to reallocate if s already has sufficient capacity. */
+ if (new_capacity <= s->capacity)
+ return 0;
+
+ works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
+ flags);
+ if (!works)
+ return -ENOMEM;
+
+ s->works = works;
+
+ for (i = s->capacity; i < new_capacity; i++) {
+ work = kzalloc(sizeof(*work), flags);
+ if (!work) {
+ /*
+ * Leave the object in a consistent state,
+ * but return an error.
+ */
+ s->capacity = i;
+ return -ENOMEM;
+ }
+ s->works[i] = work;
+ }
+ s->capacity = new_capacity;
+ return 0;
+}
+
+/*
+ * tsync_works_contains_task - checks for presence of task in s
+ */
+static bool tsync_works_contains_task(const struct tsync_works *s,
+ struct task_struct *task)
+{
+ size_t i;
+
+ for (i = 0; i < s->size; i++)
+ if (s->works[i]->task == task)
+ return true;
+ return false;
+}
+
+/*
+ * tsync_works_release - frees memory held by s and drops all task references
+ *
+ * This does not free s itself, only the data structures held by it.
+ */
+static void tsync_works_release(struct tsync_works *s)
+{
+ size_t i;
+
+ for (i = 0; i < s->size; i++) {
+ if (!s->works[i]->task)
+ continue;
+
+ put_task_struct(s->works[i]->task);
+ }
+
+ for (i = 0; i < s->capacity; i++)
+ kfree(s->works[i]);
+ kfree(s->works);
+ s->works = NULL;
+ s->size = 0;
+ s->capacity = 0;
+}
+
+/*
+ * count_additional_threads - counts the sibling threads that are not in works
+ */
+static size_t count_additional_threads(const struct tsync_works *works)
+{
+ struct task_struct *thread, *caller;
+ size_t n = 0;
+
+ caller = current;
+
+ guard(rcu)();
+
+ for_each_thread(caller, thread) {
+ /* Skip current, since it is initiating the sync. */
+ if (thread == caller)
+ continue;
+
+ /* Skip exited threads. */
+ if (thread->flags & PF_EXITING)
+ continue;
+
+ /* Skip threads that we have already seen. */
+ if (tsync_works_contains_task(works, thread))
+ continue;
+
+ n++;
+ }
+ return n;
+}
+
+/*
+ * schedule_task_work - adds task_work for all eligible sibling threads
+ * which have not been scheduled yet
+ *
+ * For each added task_work, atomically increments shared_ctx->num_preparing and
+ * shared_ctx->num_unfinished.
+ *
+ * Returns:
+ * true, if at least one eligible sibling thread was found
+ */
+static bool schedule_task_work(struct tsync_works *works,
+ struct tsync_shared_context *shared_ctx)
+{
+ int err;
+ struct task_struct *thread, *caller;
+ struct tsync_work *ctx;
+ bool found_more_threads = false;
+
+ caller = current;
+
+ guard(rcu)();
+
+ for_each_thread(caller, thread) {
+ /* Skip current, since it is initiating the sync. */
+ if (thread == caller)
+ continue;
+
+ /* Skip exited threads. */
+ if (thread->flags & PF_EXITING)
+ continue;
+
+ /* Skip threads that we already looked at. */
+ if (tsync_works_contains_task(works, thread))
+ continue;
+
+ /*
+ * We found a sibling thread that is not doing its task_work
+ * yet, and which might spawn new threads before our task work
+ * runs, so we need at least one more round in the outer loop.
+ */
+ found_more_threads = true;
+
+ ctx = tsync_works_provide(works, thread);
+ if (!ctx) {
+ /*
+ * We ran out of preallocated contexts -- we need to try
+ * again with this thread at a later time!
+ * found_more_threads is already true at this point.
+ */
+ break;
+ }
+
+ ctx->shared_ctx = shared_ctx;
+
+ atomic_inc(&shared_ctx->num_preparing);
+ atomic_inc(&shared_ctx->num_unfinished);
+
+ init_task_work(&ctx->work, restrict_one_thread_callback);
+ err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
+ if (err) {
+ /*
+ * task_work_add() only fails if the task is about to
+ * exit. We checked that earlier, but it can happen as
+ * a race. Resume without setting an error, as the task
+ * is probably gone in the next loop iteration. For
+ * consistency, remove the task from ctx so that it does
+ * not look like we handed it a task_work.
+ */
+ put_task_struct(ctx->task);
+ ctx->task = NULL;
+
+ atomic_dec(&shared_ctx->num_preparing);
+ atomic_dec(&shared_ctx->num_unfinished);
+ }
+ }
+
+ return found_more_threads;
+}
+
+/*
+ * cancel_tsync_works - cancels all task works that can still be canceled
+ *
+ * Task works can be canceled as long as they are still queued and have not
+ * started running. If they get canceled, we decrement
+ * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
+ * completions if needed, as if the task was never scheduled.
+ */
+static void cancel_tsync_works(struct tsync_works *works,
+ struct tsync_shared_context *shared_ctx)
+{
+ size_t i;
+
+ for (i = 0; i < works->size; i++) {
+ if (!task_work_cancel(works->works[i]->task,
+ &works->works[i]->work))
+ continue;
+
+ /* After dequeueing, act as if the task work had executed. */
+
+ if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
+ complete_all(&shared_ctx->all_prepared);
+
+ if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
+ complete_all(&shared_ctx->all_finished);
+ }
+}
+
+/*
+ * landlock_restrict_sibling_threads - enforces a Landlock policy on all sibling threads
+ */
+int landlock_restrict_sibling_threads(const struct cred *old_cred,
+ const struct cred *new_cred)
+{
+ int err;
+ struct tsync_shared_context shared_ctx;
+ struct tsync_works works = {};
+ size_t newly_discovered_threads;
+ bool found_more_threads;
+
+ atomic_set(&shared_ctx.preparation_error, 0);
+ init_completion(&shared_ctx.all_prepared);
+ init_completion(&shared_ctx.ready_to_commit);
+ atomic_set(&shared_ctx.num_unfinished, 1);
+ init_completion(&shared_ctx.all_finished);
+ shared_ctx.old_cred = old_cred;
+ shared_ctx.new_cred = new_cred;
+ shared_ctx.set_no_new_privs = task_no_new_privs(current);
+
+ /*
+ * We schedule a pseudo-signal task_work for each of the calling task's
+ * sibling threads. In the task work, each thread:
+ *
+ * 1) runs prepare_creds() and writes back the error to
+ * shared_ctx.preparation_error, if needed.
+ *
+ * 2) signals that it's done with prepare_creds() to the calling task.
+ * (completion "all_prepared").
+ *
+ * 3) waits for the completion "ready_to_commit". This is sent by the
+ * calling task after ensuring that all sibling threads are done
+ * with the "preparation" stage.
+ *
+ * After this barrier is reached, it's safe to read
+ * shared_ctx.preparation_error.
+ *
+ * 4) reads shared_ctx.preparation_error and then either does
+ * commit_creds() or abort_creds().
+ *
+ * 5) signals that it's done altogether (barrier synchronization
+ * "all_finished")
+ *
+ * Unlike seccomp, which modifies sibling tasks directly, we do not need
+ * to acquire the cred_guard_mutex and sighand->siglock:
+ *
+ * * Because, in our case, all threads exchange their own struct cred
+ * themselves through the credentials API, no locks are needed for
+ * that.
+ * * Our for_each_thread() loops are protected by RCU.
+ * * We do not acquire a lock to keep the list of sibling threads stable
+ * between our for_each_thread loops. If the list of available
+ * sibling threads changes between these for_each_thread loops, we
+ * make up for that by continuing to look for threads until they are
+ * all discovered and have entered their task_work, where they are
+ * unable to spawn new threads.
+ */
+ do {
+ /* In RCU read-lock, count the threads we need. */
+ newly_discovered_threads = count_additional_threads(&works);
+
+ if (newly_discovered_threads == 0)
+ break; /* done */
+
+ err = tsync_works_grow_by(&works, newly_discovered_threads,
+ GFP_KERNEL_ACCOUNT);
+ if (err) {
+ atomic_set(&shared_ctx.preparation_error, err);
+ break;
+ }
+
+ /*
+ * The "all_prepared" barrier is local to this loop body and its
+ * use of for_each_thread(). We can reset it on each loop
+ * iteration because all previous loop iterations are already
+ * done with it.
+ *
+ * num_preparing is initialized to 1 so that the counter can not
+ * go to 0 and mark the completion as done before all task works
+ * are registered. We decrement it at the end of the loop body.
+ */
+ atomic_set(&shared_ctx.num_preparing, 1);
+ reinit_completion(&shared_ctx.all_prepared);
+
+ /* In RCU read-lock, schedule task work on newly discovered sibling tasks. */
+ found_more_threads = schedule_task_work(&works, &shared_ctx);
+
+ /*
+ * Decrement num_preparing for current, to balance the
+ * initialization to 1 a few lines above.
+ */
+ if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
+ if (wait_for_completion_interruptible(
+ &shared_ctx.all_prepared)) {
+ /*
+ * In case of interruption, we need to retry the
+ * system call.
+ */
+ atomic_set(&shared_ctx.preparation_error,
+ -ERESTARTNOINTR);
+
+ /*
+ * Cancel task works for tasks that did not
+ * start running yet, and decrement num_preparing
+ * and num_unfinished accordingly.
+ */
+ cancel_tsync_works(&works, &shared_ctx);
+
+ /*
+ * The remaining task works have started
+ * running, so waiting for their completion will
+ * finish.
+ */
+ wait_for_completion(&shared_ctx.all_prepared);
+ }
+ }
+ } while (found_more_threads &&
+ !atomic_read(&shared_ctx.preparation_error));
+
+ /*
+ * We now have all sibling threads blocking and in "prepared" state in
+ * the task work. Ask all threads to commit.
+ */
+ complete_all(&shared_ctx.ready_to_commit);
+
+ /*
+ * Decrement num_unfinished for current, to balance the
+ * initialization to 1 at the beginning.
+ */
+ if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
+ wait_for_completion(&shared_ctx.all_finished);
+
+ tsync_works_release(&works);
+
+ return atomic_read(&shared_ctx.preparation_error);
+}
diff --git a/security/landlock/tsync.h b/security/landlock/tsync.h
new file mode 100644
index 000000000000..b85586db8b51
--- /dev/null
+++ b/security/landlock/tsync.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock LSM - Cross-thread ruleset enforcement
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#ifndef _SECURITY_LANDLOCK_TSYNC_H
+#define _SECURITY_LANDLOCK_TSYNC_H
+
+#include "cred.h"
+
+int landlock_restrict_sibling_threads(const struct cred *old_cred,
+ const struct cred *new_cred);
+
+#endif /* _SECURITY_LANDLOCK_TSYNC_H */
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 7b69002239d7..f4b1a275d8d9 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -76,7 +76,7 @@ TEST(abi_version)
const struct landlock_ruleset_attr ruleset_attr = {
.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
};
- ASSERT_EQ(7, landlock_create_ruleset(NULL, 0,
+ ASSERT_EQ(8, landlock_create_ruleset(NULL, 0,
LANDLOCK_CREATE_RULESET_VERSION));
ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
--
2.52.0.177.g9f829587af-goog