Diffstat (limited to 'security/landlock/tsync.c')
-rw-r--r--	security/landlock/tsync.c	204
1 file changed, 131 insertions(+), 73 deletions(-)
diff --git a/security/landlock/tsync.c b/security/landlock/tsync.c
index de01aa899751..c5730bbd9ed3 100644
--- a/security/landlock/tsync.c
+++ b/security/landlock/tsync.c
@@ -85,12 +85,14 @@ static void restrict_one_thread(struct tsync_shared_context *ctx)
 		/*
 		 * Switch out old_cred with new_cred, if possible.
 		 *
-		 * In the common case, where all threads initially point to the same
-		 * struct cred, this optimization avoids creating separate redundant
-		 * credentials objects for each, which would all have the same contents.
+		 * In the common case, where all threads initially point to the
+		 * same struct cred, this optimization avoids creating separate
+		 * redundant credentials objects for each, which would all have
+		 * the same contents.
 		 *
-		 * Note: We are intentionally dropping the const qualifier here, because
-		 * it is required by commit_creds() and abort_creds().
+		 * Note: We are intentionally dropping the const qualifier
+		 * here, because it is required by commit_creds() and
+		 * abort_creds().
 		 */
 		cred = (struct cred *)get_cred(ctx->new_cred);
 	} else {
@@ -101,8 +103,8 @@ static void restrict_one_thread(struct tsync_shared_context *ctx)
 		atomic_set(&ctx->preparation_error, -ENOMEM);
 
 	/*
-	 * Even on error, we need to adhere to the protocol and coordinate
-	 * with concurrently running invocations.
+	 * Even on error, we need to adhere to the protocol and
+	 * coordinate with concurrently running invocations.
 	 */
 	if (atomic_dec_return(&ctx->num_preparing) == 0)
 		complete_all(&ctx->all_prepared);
@@ -135,9 +137,9 @@ static void restrict_one_thread(struct tsync_shared_context *ctx)
 	}
 
 	/*
-	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
-	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
-	 * kernel/seccomp.c)
+	 * Make sure that all sibling tasks fulfill the no_new_privs
+	 * prerequisite. (This is in line with Seccomp's
+	 * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c)
 	 */
 	if (ctx->set_no_new_privs)
 		task_set_no_new_privs(current);
@@ -183,10 +185,8 @@ struct tsync_works {
  * capacity. This can legitimately happen if new threads get started after we
  * grew the capacity.
  *
- * Returns:
- * A pointer to the preallocated context struct, with task filled in.
- *
- * NULL, if we ran out of preallocated context structs.
+ * Return: A pointer to the preallocated context struct with task filled in, or
+ * NULL if preallocated context structs ran out.
 */
 static struct tsync_work *tsync_works_provide(struct tsync_works *s,
 					      struct task_struct *task)
@@ -203,17 +203,49 @@ static struct tsync_work *tsync_works_provide(struct tsync_works *s,
 	return ctx;
 }
 
+/**
+ * tsync_works_trim - Put the last tsync_work element
+ *
+ * @s: TSYNC works to trim.
+ *
+ * Put the last task and decrement the size of @s.
+ *
+ * This helper does not cancel a running task, but just reset the last element
+ * to zero.
+ */
+static void tsync_works_trim(struct tsync_works *s)
+{
+	struct tsync_work *ctx;
+
+	if (WARN_ON_ONCE(s->size <= 0))
+		return;
+
+	ctx = s->works[s->size - 1];
+
+	/*
+	 * For consistency, remove the task from ctx so that it does not look
+	 * like we handed it a task_work.
+	 */
+	put_task_struct(ctx->task);
+	*ctx = (typeof(*ctx)){};
+
+	/*
+	 * Cancel the tsync_works_provide() change to recycle the reserved
+	 * memory for the next thread, if any. This also ensures that
+	 * cancel_tsync_works() and tsync_works_release() do not see any NULL
+	 * task pointers.
+	 */
+	s->size--;
+}
+
 /*
  * tsync_works_grow_by - preallocates space for n more contexts in s
  *
  * On a successful return, the subsequent n calls to tsync_works_provide() are
  * guaranteed to succeed. (size + n <= capacity)
  *
- * Returns:
- * -ENOMEM if the (re)allocation fails
-
- * 0 if the allocation succeeds, partially succeeds, or no reallocation
- * was needed
+ * Return: 0 if sufficient space for n more elements could be provided, -ENOMEM
+ * on allocation errors, -EOVERFLOW in case of integer overflow.
 */
 static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
 {
@@ -256,13 +288,14 @@ static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
  * tsync_works_contains - checks for presence of task in s
  */
 static bool tsync_works_contains_task(const struct tsync_works *s,
-				      struct task_struct *task)
+				      const struct task_struct *task)
 {
 	size_t i;
 
 	for (i = 0; i < s->size; i++)
 		if (s->works[i]->task == task)
 			return true;
+
 	return false;
 }
 
@@ -276,7 +309,7 @@ static void tsync_works_release(struct tsync_works *s)
 	size_t i;
 
 	for (i = 0; i < s->size; i++) {
-		if (!s->works[i]->task)
+		if (WARN_ON_ONCE(!s->works[i]->task))
 			continue;
 
 		put_task_struct(s->works[i]->task);
@@ -284,6 +317,7 @@ static void tsync_works_release(struct tsync_works *s)
 
 	for (i = 0; i < s->capacity; i++)
 		kfree(s->works[i]);
+	kfree(s->works);
 	s->works = NULL;
 	s->size = 0;
 
@@ -295,7 +329,7 @@
  */
 static size_t count_additional_threads(const struct tsync_works *works)
 {
-	struct task_struct *thread, *caller;
+	const struct task_struct *caller, *thread;
 	size_t n = 0;
 
 	caller = current;
@@ -327,14 +361,15 @@ static size_t count_additional_threads(const struct tsync_works *works)
  * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
- * Returns:
- * true, if at least one eligible sibling thread was found
+ * Return: True if at least one eligible sibling thread was found, false
+ * otherwise.
 */
 static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
 {
 	int err;
-	struct task_struct *thread, *caller;
+	const struct task_struct *caller;
+	struct task_struct *thread;
 	struct tsync_work *ctx;
 	bool found_more_threads = false;
 
@@ -356,17 +391,17 @@ static bool schedule_task_work(struct tsync_works *works,
 			continue;
 
 		/*
-		 * We found a sibling thread that is not doing its task_work yet, and
-		 * which might spawn new threads before our task work runs, so we need
-		 * at least one more round in the outer loop.
+		 * We found a sibling thread that is not doing its task_work
+		 * yet, and which might spawn new threads before our task work
+		 * runs, so we need at least one more round in the outer loop.
 		 */
 		found_more_threads = true;
 		ctx = tsync_works_provide(works, thread);
 		if (!ctx) {
 			/*
-			 * We ran out of preallocated contexts -- we need to try again with
-			 * this thread at a later time!
+			 * We ran out of preallocated contexts -- we need to
+			 * try again with this thread at a later time!
 			 * found_more_threads is already true at this point.
 			 */
 			break;
 		}
@@ -379,16 +414,14 @@ static bool schedule_task_work(struct tsync_works *works,
 
 		init_task_work(&ctx->work, restrict_one_thread_callback);
 		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
-		if (err) {
+		if (unlikely(err)) {
 			/*
-			 * task_work_add() only fails if the task is about to exit. We
-			 * checked that earlier, but it can happen as a race. Resume
-			 * without setting an error, as the task is probably gone in the
-			 * next loop iteration. For consistency, remove the task from ctx
-			 * so that it does not look like we handed it a task_work.
+			 * task_work_add() only fails if the task is about to
+			 * exit. We checked that earlier, but it can happen as
+			 * a race. Resume without setting an error, as the
+			 * task is probably gone in the next loop iteration.
 			 */
-			put_task_struct(ctx->task);
-			ctx->task = NULL;
+			tsync_works_trim(works);
 
 			atomic_dec(&shared_ctx->num_preparing);
 			atomic_dec(&shared_ctx->num_unfinished);
@@ -406,12 +439,15 @@
  * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
-static void cancel_tsync_works(struct tsync_works *works,
+static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
 {
-	int i;
+	size_t i;
 
 	for (i = 0; i < works->size; i++) {
+		if (WARN_ON_ONCE(!works->works[i]->task))
+			continue;
+
 		if (!task_work_cancel(works->works[i]->task,
 				      &works->works[i]->work))
 			continue;
@@ -448,6 +484,16 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
 	shared_ctx.set_no_new_privs = task_no_new_privs(current);
 
 	/*
+	 * Serialize concurrent TSYNC operations to prevent deadlocks when
+	 * multiple threads call landlock_restrict_self() simultaneously.
+	 * If the lock is already held, we gracefully yield by restarting the
+	 * syscall. This allows the current thread to process pending
+	 * task_works before retrying.
+	 */
+	if (!down_write_trylock(&current->signal->exec_update_lock))
+		return restart_syscall();
+
+	/*
 	 * We schedule a pseudo-signal task_work for each of the calling task's
 	 * sibling threads. In the task work, each thread:
 	 *
@@ -464,24 +510,25 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
 	 * After this barrier is reached, it's safe to read
 	 * shared_ctx.preparation_error.
 	 *
-	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
-	 * or abort_creds().
+	 * 4) reads shared_ctx.preparation_error and then either does
+	 * commit_creds() or abort_creds().
 	 *
 	 * 5) signals that it's done altogether (barrier synchronization
 	 * "all_finished")
 	 *
-	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
-	 * acquire the cred_guard_mutex and sighand->siglock:
+	 * Unlike seccomp, which modifies sibling tasks directly, we do not
+	 * need to acquire the cred_guard_mutex and sighand->siglock:
 	 *
-	 * - As in our case, all threads are themselves exchanging their own struct
-	 * cred through the credentials API, no locks are needed for that.
+	 * - As in our case, all threads are themselves exchanging their own
+	 * struct cred through the credentials API, no locks are needed for
+	 * that.
 	 * - Our for_each_thread() loops are protected by RCU.
-	 * - We do not acquire a lock to keep the list of sibling threads stable
-	 * between our for_each_thread loops. If the list of available sibling
-	 * threads changes between these for_each_thread loops, we make up for
-	 * that by continuing to look for threads until they are all discovered
-	 * and have entered their task_work, where they are unable to spawn new
-	 * threads.
+	 * - We do not acquire a lock to keep the list of sibling threads
+	 * stable between our for_each_thread loops. If the list of
+	 * available sibling threads changes between these for_each_thread
+	 * loops, we make up for that by continuing to look for threads until
+	 * they are all discovered and have entered their task_work, where
+	 * they are unable to spawn new threads.
 	 */
 	do {
 		/* In RCU read-lock, count the threads we need. */
@@ -498,64 +545,75 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
 		}
 
 		/*
-		 * The "all_prepared" barrier is used locally to the loop body, this use
-		 * of for_each_thread(). We can reset it on each loop iteration because
-		 * all previous loop iterations are done with it already.
+		 * The "all_prepared" barrier is used locally to the loop body,
+		 * this use of for_each_thread(). We can reset it on each loop
+		 * iteration because all previous loop iterations are done with
+		 * it already.
 		 *
-		 * num_preparing is initialized to 1 so that the counter can not go to 0
-		 * and mark the completion as done before all task works are registered.
-		 * We decrement it at the end of the loop body.
+		 * num_preparing is initialized to 1 so that the counter can
+		 * not go to 0 and mark the completion as done before all task
+		 * works are registered. We decrement it at the end of the
+		 * loop body.
 		 */
 		atomic_set(&shared_ctx.num_preparing, 1);
 		reinit_completion(&shared_ctx.all_prepared);
 
 		/*
-		 * In RCU read-lock, schedule task work on newly discovered sibling
-		 * tasks.
+		 * In RCU read-lock, schedule task work on newly discovered
+		 * sibling tasks.
 		 */
 		found_more_threads = schedule_task_work(&works, &shared_ctx);
 
 		/*
-		 * Decrement num_preparing for current, to undo that we initialized it
-		 * to 1 a few lines above.
+		 * Decrement num_preparing for current, to undo that we
+		 * initialized it to 1 a few lines above.
 		 */
 		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
 			if (wait_for_completion_interruptible(
 				    &shared_ctx.all_prepared)) {
-				/* In case of interruption, we need to retry the system call. */
+				/*
+				 * In case of interruption, we need to retry
+				 * the system call.
+				 */
 				atomic_set(&shared_ctx.preparation_error,
 					   -ERESTARTNOINTR);
 
 				/*
-				 * Cancel task works for tasks that did not start running yet,
-				 * and decrement all_prepared and num_unfinished accordingly.
+				 * Opportunistic improvement: try to cancel task
+				 * works for tasks that did not start running
+				 * yet. We do not have a guarantee that it
+				 * cancels any of the enqueued task works
+				 * because task_work_run() might already have
+				 * dequeued them.
 				 */
 				cancel_tsync_works(&works, &shared_ctx);
 
 				/*
-				 * The remaining task works have started running, so waiting for
-				 * their completion will finish.
+				 * Break the loop with error. The cleanup code
+				 * after the loop unblocks the remaining
+				 * task_works.
 				 */
-				wait_for_completion(&shared_ctx.all_prepared);
+				break;
 			}
 		}
 	} while (found_more_threads &&
 		 !atomic_read(&shared_ctx.preparation_error));
 
 	/*
-	 * We now have all sibling threads blocking and in "prepared" state in the
-	 * task work. Ask all threads to commit.
+	 * We now have either (a) all sibling threads blocking and in "prepared"
+	 * state in the task work, or (b) the preparation error is set. Ask all
+	 * threads to commit (or abort).
 	 */
 	complete_all(&shared_ctx.ready_to_commit);
 
 	/*
-	 * Decrement num_unfinished for current, to undo that we initialized it to 1
-	 * at the beginning.
+	 * Decrement num_unfinished for current, to undo that we initialized it
+	 * to 1 at the beginning.
 	 */
 	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
 		wait_for_completion(&shared_ctx.all_finished);
 
 	tsync_works_release(&works);
-
+	up_write(&current->signal->exec_update_lock);
 	return atomic_read(&shared_ctx.preparation_error);
 }
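For readers following the barrier protocol described in the comments above, the prepare/commit handshake can be modeled in plain userspace C. The sketch below is an illustration only, not kernel code: a mutex/condvar pair stands in for struct completion and the atomic counters, the leader's self-count and all error, RCU, and restart handling are omitted, and every name in it is made up.

/* Build with: cc -pthread barrier_demo.c */
#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int num_preparing = NTHREADS;   /* like shared_ctx.num_preparing */
static int ready_to_commit;            /* like shared_ctx.ready_to_commit */
static int num_unfinished = NTHREADS;  /* like shared_ctx.num_unfinished */

static void *sibling(void *arg)
{
	pthread_mutex_lock(&lock);

	/* Steps 1-2: prepare, then signal "all_prepared" if we are last. */
	if (--num_preparing == 0)
		pthread_cond_broadcast(&cond);

	/* Step 3: block until the leader signals "ready_to_commit". */
	while (!ready_to_commit)
		pthread_cond_wait(&cond, &lock);

	/* Steps 4-5: commit, then signal "all_finished" if we are last. */
	if (--num_unfinished == 0)
		pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t threads[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&threads[i], NULL, sibling, NULL);

	pthread_mutex_lock(&lock);
	/* Leader: wait for the "all_prepared" barrier. */
	while (num_preparing > 0)
		pthread_cond_wait(&cond, &lock);

	/* No preparation error here: release everyone to commit. */
	ready_to_commit = 1;
	pthread_cond_broadcast(&cond);

	/* Wait for the "all_finished" barrier. */
	while (num_unfinished > 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < NTHREADS; i++)
		pthread_join(threads[i], NULL);
	puts("all siblings prepared and committed");
	return 0;
}

The two counted waits mirror why the kernel code initializes num_preparing to 1 for the caller: without the extra count, a fast sibling could complete the barrier before the leader finished registering all task_works.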

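For completeness, this is roughly how a multithreaded process might opt in from userspace once such a patch lands. The LANDLOCK_RESTRICT_SELF_TSYNC flag name and its fallback value below are assumptions based on this RFC (take the real definition from the patched uapi/linux/landlock.h); the rest is the existing Landlock UAPI. A restart_syscall()-based retry, as added above, is transparent to this caller.

#define _GNU_SOURCE
#include <linux/landlock.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef LANDLOCK_RESTRICT_SELF_TSYNC
/* Assumed flag: placeholder value, not part of the released UAPI. */
#define LANDLOCK_RESTRICT_SELF_TSYNC (1U << 3)
#endif

int main(void)
{
	const struct landlock_ruleset_attr attr = {
		.handled_access_fs = LANDLOCK_ACCESS_FS_EXECUTE,
	};
	int ruleset_fd;

	ruleset_fd = syscall(SYS_landlock_create_ruleset, &attr,
			     sizeof(attr), 0);
	if (ruleset_fd < 0) {
		perror("landlock_create_ruleset");
		return 1;
	}

	/* no_new_privs is a prerequisite, as with seccomp's TSYNC. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
		perror("prctl");
		return 1;
	}

	/*
	 * With the (assumed) TSYNC flag, the kernel queues a task_work on
	 * every sibling thread and either restricts the whole thread group
	 * or, on preparation error, none of it.
	 */
	if (syscall(SYS_landlock_restrict_self, ruleset_fd,
		    LANDLOCK_RESTRICT_SELF_TSYNC)) {
		perror("landlock_restrict_self");
		return 1;
	}
	close(ruleset_fd);
	return 0;
}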