// SPDX-License-Identifier: GPL-2.0-only
/*
* Landlock - Cross-thread ruleset enforcement
*
* Copyright © 2025 Google LLC
*/
#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>
#include "cred.h"
#include "tsync.h"
/*
* Shared state between multiple threads which are enforcing Landlock rulesets
* in lockstep with each other.
*/
struct tsync_shared_context {
/* The old and tentative new creds of the calling thread. */
const struct cred *old_cred;
const struct cred *new_cred;
/* True if sibling tasks need to set the no_new_privs flag. */
bool set_no_new_privs;
/* An error encountered in the preparation step, or 0. */
atomic_t preparation_error;
/*
* Barrier after preparation step in restrict_one_thread.
* The calling thread waits for completion.
*
* Re-initialized on every round of looking for newly spawned threads.
*/
atomic_t num_preparing;
struct completion all_prepared;
/* Sibling threads wait for completion. */
struct completion ready_to_commit;
/*
* Barrier after commit step (used by syscall impl to wait for
* completion).
*/
atomic_t num_unfinished;
struct completion all_finished;
};
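/*
* struct tsync_work - per-thread work item for one enforcement operation
*
* Ties a queued task_work callback to its target sibling task and to the
* shared context of the ongoing cross-thread enforcement.
*/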
struct tsync_work {
struct callback_head work;
struct task_struct *task;
struct tsync_shared_context *shared_ctx;
};
/*
* restrict_one_thread - update a thread's Landlock domain in lockstep with the
* other threads in the same process
*
* This function runs concurrently in every thread of the process except the
* calling thread, which invoked landlock_restrict_self(). The concurrent
* invocations of restrict_one_thread() coordinate through the shared ctx
* object and do their work in lockstep to implement all-or-nothing semantics
* for enforcing the new Landlock domain.
*
* Afterwards, depending on the presence of an error, all threads either commit
* or abort the prepared credentials. The commit operation cannot fail any
* more.
*/
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
int err;
struct cred *cred = NULL;
if (current_cred() == ctx->old_cred) {
/*
* Switch out old_cred with new_cred, if possible.
*
* In the common case, where all threads initially point to the same
* struct cred, this optimization avoids creating separate redundant
* credentials objects for each, which would all have the same contents.
*
* Note: We are intentionally dropping the const qualifier here, because
* it is required by commit_creds() and abort_creds().
*/
cred = (struct cred *)get_cred(ctx->new_cred);
} else {
/* Else, prepare new creds and populate them. */
cred = prepare_creds();
if (!cred) {
atomic_set(&ctx->preparation_error, -ENOMEM);
/*
* Even on error, we need to adhere to the protocol and coordinate
* with concurrently running invocations.
*/
if (atomic_dec_return(&ctx->num_preparing) == 0)
complete_all(&ctx->all_prepared);
goto out;
}
landlock_cred_copy(landlock_cred(cred),
landlock_cred(ctx->new_cred));
}
/*
* Barrier: Wait until all threads are done preparing.
* After this point, we can have no more failures.
*/
if (atomic_dec_return(&ctx->num_preparing) == 0)
complete_all(&ctx->all_prepared);
/*
* Wait for signal from calling thread that it's safe to read the
* preparation error now and we are ready to commit (or abort).
*/
wait_for_completion(&ctx->ready_to_commit);
/* Abort the commit if any of the other threads had an error. */
err = atomic_read(&ctx->preparation_error);
if (err) {
abort_creds(cred);
goto out;
}
/*
* Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
* (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
* kernel/seccomp.c)
*/
if (ctx->set_no_new_privs)
task_set_no_new_privs(current);
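/* Point of no return: commit_creds() cannot fail. */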
commit_creds(cred);
out:
/* Notify the calling thread once all threads are done */
if (atomic_dec_return(&ctx->num_unfinished) == 0)
complete_all(&ctx->all_finished);
}
/*
* restrict_one_thread_callback - task_work callback for restricting a thread
*
* Calls restrict_one_thread() with the shared struct tsync_shared_context.
*/
static void restrict_one_thread_callback(struct callback_head *work)
{
struct tsync_work *ctx = container_of(work, struct tsync_work, work);
restrict_one_thread(ctx->shared_ctx);
}
/*
* struct tsync_works - a growable array of per-task contexts
*
* The zero-initialized struct represents the empty array.
*/
struct tsync_works {
struct tsync_work **works;
size_t size;
size_t capacity;
};
/*
* tsync_works_provide - provides a preallocated tsync_work for the given task
*
* This also stores a task pointer in the context and increments the reference
* count of the task.
*
* This function may fail in the case where we did not preallocate sufficient
* capacity. This can legitimately happen if new threads get started after we
* grew the capacity.
*
* Returns:
* A pointer to the preallocated context struct, with task filled in.
*
* NULL, if we ran out of preallocated context structs.
*/
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
struct task_struct *task)
{
struct tsync_work *ctx;
if (s->size >= s->capacity)
return NULL;
ctx = s->works[s->size];
s->size++;
ctx->task = get_task_struct(task);
return ctx;
}
/*
* tsync_works_grow_by - preallocates space for n more contexts in s
*
* On a successful return, the subsequent n calls to tsync_works_provide() are
* guaranteed to succeed. (size + n <= capacity)
*
* Returns:
* -EOVERFLOW if size + n overflows
* -ENOMEM if the (re)allocation fails; the capacity may still have grown
* partially in this case
* 0 if the allocation fully succeeds or no reallocation was needed
*/
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
size_t i;
size_t new_capacity;
struct tsync_work **works;
struct tsync_work *work;
if (check_add_overflow(s->size, n, &new_capacity))
return -EOVERFLOW;
/* No need to reallocate if s already has sufficient capacity. */
if (new_capacity <= s->capacity)
return 0;
works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
flags);
if (!works)
return -ENOMEM;
s->works = works;
for (i = s->capacity; i < new_capacity; i++) {
work = kzalloc(sizeof(*work), flags);
if (!work) {
/*
* Leave the object in a consistent state,
* but return an error.
*/
s->capacity = i;
return -ENOMEM;
}
s->works[i] = work;
}
s->capacity = new_capacity;
return 0;
}
/*
* tsync_works_contains_task - checks for the presence of task in s
*/
static bool tsync_works_contains_task(const struct tsync_works *s,
struct task_struct *task)
{
size_t i;
for (i = 0; i < s->size; i++)
if (s->works[i]->task == task)
return true;
return false;
}
/*
* tsync_works_release - frees memory held by s and drops all task references
*
* This does not free s itself, only the data structures held by it.
*/
static void tsync_works_release(struct tsync_works *s)
{
size_t i;
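/*
* A slot's task pointer may have been reset to NULL in schedule_task_work()
* when task_work_add() failed, so only drop task references that are still
* held.
*/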
for (i = 0; i < s->size; i++) {
if (!s->works[i]->task)
continue;
put_task_struct(s->works[i]->task);
}
for (i = 0; i < s->capacity; i++)
kfree(s->works[i]);
kfree(s->works);
s->works = NULL;
s->size = 0;
s->capacity = 0;
}
/*
* count_additional_threads - counts the sibling threads that are not in works
*/
static size_t count_additional_threads(const struct tsync_works *works)
{
struct task_struct *thread, *caller;
size_t n = 0;
caller = current;
guard(rcu)();
for_each_thread(caller, thread) {
/* Skip current, since it is initiating the sync. */
if (thread == caller)
continue;
/* Skip exited threads. */
if (thread->flags & PF_EXITING)
continue;
/* Skip threads that we have already seen. */
if (tsync_works_contains_task(works, thread))
continue;
n++;
}
return n;
}
/*
* schedule_task_work - adds a task_work for each eligible sibling thread
* that does not have one scheduled yet
*
* For each added task_work, atomically increments shared_ctx->num_preparing and
* shared_ctx->num_unfinished.
*
* Returns:
* true, if at least one eligible sibling thread was found
*/
static bool schedule_task_work(struct tsync_works *works,
struct tsync_shared_context *shared_ctx)
{
int err;
struct task_struct *thread, *caller;
struct tsync_work *ctx;
bool found_more_threads = false;
caller = current;
guard(rcu)();
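/*
* The RCU read lock is held for the rest of this function, so sleeping is
* not allowed here; the work items in works were therefore preallocated by
* the caller before entering this function.
*/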
for_each_thread(caller, thread) {
/* Skip current, since it is initiating the sync. */
if (thread == caller)
continue;
/* Skip exited threads. */
if (thread->flags & PF_EXITING)
continue;
/* Skip threads that we already looked at. */
if (tsync_works_contains_task(works, thread))
continue;
/*
* We found a sibling thread that is not doing its task_work yet, and
* which might spawn new threads before our task work runs, so we need
* at least one more round in the outer loop.
*/
found_more_threads = true;
ctx = tsync_works_provide(works, thread);
if (!ctx) {
/*
* We ran out of preallocated contexts -- we need to try again with
* this thread at a later time!
* found_more_threads is already true at this point.
*/
break;
}
ctx->shared_ctx = shared_ctx;
atomic_inc(&shared_ctx->num_preparing);
atomic_inc(&shared_ctx->num_unfinished);
init_task_work(&ctx->work, restrict_one_thread_callback);
err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
if (err) {
/*
* task_work_add() only fails if the task is about to exit. We
* checked that earlier, but it can happen as a race. Resume
* without setting an error, as the task is probably gone in the
* next loop iteration. For consistency, remove the task from ctx
* so that it does not look like we handed it a task_work.
*/
put_task_struct(ctx->task);
ctx->task = NULL;
atomic_dec(&shared_ctx->num_preparing);
atomic_dec(&shared_ctx->num_unfinished);
}
}
return found_more_threads;
}
/*
* cancel_tsync_works - cancels all task works that can still be canceled
*
* Task works can be canceled as long as they are still queued and have not
* started running. If they get canceled, we decrement
* shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
* completions if needed, as if the task work had never been scheduled.
*/
static void cancel_tsync_works(struct tsync_works *works,
struct tsync_shared_context *shared_ctx)
{
size_t i;
for (i = 0; i < works->size; i++) {
if (!task_work_cancel(works->works[i]->task,
&works->works[i]->work))
continue;
/* After dequeueing, act as if the task work had executed. */
if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
complete_all(&shared_ctx->all_prepared);
if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
complete_all(&shared_ctx->all_finished);
}
}
/*
* landlock_restrict_sibling_threads - enables a Landlock policy for all
* sibling threads of the calling thread
*
* Returns:
* 0 on success, or a negative error code (e.g. -ENOMEM) if preparation
* failed in any thread, or -ERESTARTNOINTR if the wait was interrupted.
*/
int landlock_restrict_sibling_threads(const struct cred *old_cred,
const struct cred *new_cred)
{
int err;
struct tsync_shared_context shared_ctx;
struct tsync_works works = {};
size_t newly_discovered_threads;
bool found_more_threads;
atomic_set(&shared_ctx.preparation_error, 0);
init_completion(&shared_ctx.all_prepared);
init_completion(&shared_ctx.ready_to_commit);
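/*
* num_unfinished starts at 1 to account for the calling thread itself, so
* that the "all_finished" completion cannot fire before the calling thread
* decrements it again after the loop below.
*/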
atomic_set(&shared_ctx.num_unfinished, 1);
init_completion(&shared_ctx.all_finished);
shared_ctx.old_cred = old_cred;
shared_ctx.new_cred = new_cred;
shared_ctx.set_no_new_privs = task_no_new_privs(current);
/*
* We schedule a pseudo-signal task_work for each of the calling task's
* sibling threads. In the task work, each thread:
*
* 1) runs prepare_creds() and writes back the error to
* shared_ctx.preparation_error, if needed.
*
* 2) signals that it's done with prepare_creds() to the calling task.
* (completion "all_prepared").
*
* 3) waits for the completion "ready_to_commit". This is sent by the
* calling task after ensuring that all sibling threads are done
* with the "preparation" stage.
*
* After this barrier is reached, it's safe to read
* shared_ctx.preparation_error.
*
* 4) reads shared_ctx.preparation_error and then either does commit_creds()
* or abort_creds().
*
* 5) signals that it's done altogether (barrier synchronization
* "all_finished")
*
* Unlike seccomp, which modifies sibling tasks directly, we do not need to
* acquire the cred_guard_mutex and sighand->siglock:
*
* - In our case, each thread exchanges its own struct cred through the
* credentials API itself, so no locks are needed for that.
* - Our for_each_thread() loops are protected by RCU.
* - We do not acquire a lock to keep the list of sibling threads stable
* between our for_each_thread loops. If the list of available sibling
* threads changes between these for_each_thread loops, we make up for
* that by continuing to look for threads until they are all discovered
* and have entered their task_work, where they are unable to spawn new
* threads.
*/
do {
/* In RCU read-lock, count the threads we need. */
newly_discovered_threads = count_additional_threads(&works);
if (newly_discovered_threads == 0)
break; /* done */
err = tsync_works_grow_by(&works, newly_discovered_threads,
GFP_KERNEL_ACCOUNT);
if (err) {
atomic_set(&shared_ctx.preparation_error, err);
break;
}
/*
* The "all_prepared" barrier is used locally to the loop body, this use
* of for_each_thread(). We can reset it on each loop iteration because
* all previous loop iterations are done with it already.
*
* num_preparing is initialized to 1 so that the counter can not go to 0
* and mark the completion as done before all task works are registered.
* We decrement it at the end of the loop body.
*/
atomic_set(&shared_ctx.num_preparing, 1);
reinit_completion(&shared_ctx.all_prepared);
/*
* In RCU read-lock, schedule task work on newly discovered sibling
* tasks.
*/
found_more_threads = schedule_task_work(&works, &shared_ctx);
/*
* Decrement num_preparing for the calling thread, undoing its
* initialization to 1 a few lines above.
*/
if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
if (wait_for_completion_interruptible(
&shared_ctx.all_prepared)) {
/* In case of interruption, we need to retry the system call. */
atomic_set(&shared_ctx.preparation_error,
-ERESTARTNOINTR);
/*
* Cancel task works that did not start running yet, and decrement
* num_preparing and num_unfinished accordingly.
*/
cancel_tsync_works(&works, &shared_ctx);
/*
* The remaining task works have started running, so waiting for
* their completion will finish.
*/
wait_for_completion(&shared_ctx.all_prepared);
}
}
} while (found_more_threads &&
!atomic_read(&shared_ctx.preparation_error));
/*
* All sibling threads are now blocked in the "prepared" state in their
* task work. Ask all threads to commit.
*/
complete_all(&shared_ctx.ready_to_commit);
/*
* Decrement num_unfinished for the calling thread, undoing its
* initialization to 1 at the beginning.
*/
if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
wait_for_completion(&shared_ctx.all_finished);
tsync_works_release(&works);
return atomic_read(&shared_ctx.preparation_error);
}