From 21b195c05cf6a6cc49777d6992772bcf01502186 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Thu, 23 Dec 2021 20:31:37 +0800
Subject: workqueue: Remove the mb() pair between wq_worker_sleeping() and
 insert_work()

In wq_worker_sleeping(), the access to worklist is protected by the
pool->lock, so the memory barrier is unneeded.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33f1106b4f99..29b070106f34 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -918,10 +918,6 @@ void wq_worker_sleeping(struct task_struct *task)
 	}
 
 	/*
-	 * The counterpart of the following dec_and_test, implied mb,
-	 * worklist not empty test sequence is in insert_work().
-	 * Please read comment there.
-	 *
 	 * NOT_RUNNING is clear.  This means that we're bound to and
 	 * running on the local cpu w/ rq lock held and preemption
 	 * disabled, which in turn means that none else could be
@@ -1372,13 +1368,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
 	list_add_tail(&work->entry, head);
 	get_pwq(pwq);
 
-	/*
-	 * Ensure either wq_worker_sleeping() sees the above
-	 * list_add_tail() or we see zero nr_running to avoid workers lying
-	 * around lazily while there are works to be processed.
-	 */
-	smp_mb();
-
 	if (__need_more_worker(pool))
 		wake_up_worker(pool);
 }
-- 
cgit v1.2.3


From 2c1f1a9180bfacbc3c8e5b10075640cc810cf9c0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Thu, 23 Dec 2021 20:31:38 +0800
Subject: workqueue: Change the comments of the synchronization about the
 idle_list

The access to idle_list in wq_worker_sleeping() is changed to be
protected by pool->lock, so the comments above idle_list can be changed
to "L:" which is the meaning of "access with pool->lock held".

And the outdated comments in wq_worker_sleeping() is removed since
the function is not called with rq lock held any more, idle_list is
dereferenced with pool lock now.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29b070106f34..b3207722671c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -162,7 +162,7 @@ struct worker_pool {
 	int			nr_workers;	/* L: total number of workers */
 	int			nr_idle;	/* L: currently idle workers */
 
-	struct list_head	idle_list;	/* X: list of idle workers */
+	struct list_head	idle_list;	/* L: list of idle workers */
 	struct timer_list	idle_timer;	/* L: worker idle timeout */
 	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
 
@@ -826,7 +826,7 @@ static bool too_many_workers(struct worker_pool *pool)
  * Wake up functions.
  */
 
-/* Return the first idle worker.  Safe with preemption disabled */
+/* Return the first idle worker.  Called with pool->lock held. */
 static struct worker *first_idle_worker(struct worker_pool *pool)
 {
 	if (unlikely(list_empty(&pool->idle_list)))
@@ -917,13 +917,6 @@ void wq_worker_sleeping(struct task_struct *task)
 		return;
 	}
 
-	/*
-	 * NOT_RUNNING is clear.  This means that we're bound to and
-	 * running on the local cpu w/ rq lock held and preemption
-	 * disabled, which in turn means that none else could be
-	 * manipulating idle_list, so dereferencing idle_list without pool
-	 * lock is safe.
-	 */
 	if (atomic_dec_and_test(&pool->nr_running) &&
 	    !list_empty(&pool->worklist)) {
 		next = first_idle_worker(pool);
-- 
cgit v1.2.3


From cc5bff38463e0894dd596befa99f9d6860e15f5e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Thu, 23 Dec 2021 20:31:39 +0800
Subject: workqueue: Use wake_up_worker() in wq_worker_sleeping() instead of
 open code

The wakeup code in wq_worker_sleeping() is the same as wake_up_worker().

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b3207722671c..69cbe9e62bf1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -887,7 +887,7 @@ void wq_worker_running(struct task_struct *task)
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-	struct worker *next, *worker = kthread_data(task);
+	struct worker *worker = kthread_data(task);
 	struct worker_pool *pool;
 
 	/*
@@ -918,11 +918,8 @@ void wq_worker_sleeping(struct task_struct *task)
 	}
 
 	if (atomic_dec_and_test(&pool->nr_running) &&
-	    !list_empty(&pool->worklist)) {
-		next = first_idle_worker(pool);
-		if (next)
-			wake_up_process(next->task);
-	}
+	    !list_empty(&pool->worklist))
+		wake_up_worker(pool);
 	raw_spin_unlock_irq(&pool->lock);
 }
 
-- 
cgit v1.2.3


From bc35f7ef96284b8c963991357a9278a6beafca54 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@linux.alibaba.com>
Date: Thu, 23 Dec 2021 20:31:40 +0800
Subject: workqueue: Convert the type of pool->nr_running to int

It is only modified in associated CPU, so it doesn't need to be atomic.

tj: Comment updated.

Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 69cbe9e62bf1..835d25e65bb2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -154,8 +154,13 @@ struct worker_pool {
 
 	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
 
-	/* The current concurrency level. */
-	atomic_t		nr_running;
+	/*
+	 * The counter is incremented in a process context on the associated CPU
+	 * w/ preemption disabled, and decremented or reset in the same context
+	 * but w/ pool->lock held. The readers grab pool->lock and are
+	 * guaranteed to see if the counter reached zero.
+	 */
+	int			nr_running;
 
 	struct list_head	worklist;	/* L: list of pending works */
 
@@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work)
 
 static bool __need_more_worker(struct worker_pool *pool)
 {
-	return !atomic_read(&pool->nr_running);
+	return !pool->nr_running;
 }
 
 /*
@@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool)
 /* Do I need to keep working?  Called from currently running workers. */
 static bool keep_working(struct worker_pool *pool)
 {
-	return !list_empty(&pool->worklist) &&
-		atomic_read(&pool->nr_running) <= 1;
+	return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
 }
 
 /* Do we need a new worker?  Called from manager. */
@@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task)
 	 */
 	preempt_disable();
 	if (!(worker->flags & WORKER_NOT_RUNNING))
-		atomic_inc(&worker->pool->nr_running);
+		worker->pool->nr_running++;
 	preempt_enable();
 	worker->sleeping = 0;
 }
@@ -917,8 +921,8 @@ void wq_worker_sleeping(struct task_struct *task)
 		return;
 	}
 
-	if (atomic_dec_and_test(&pool->nr_running) &&
-	    !list_empty(&pool->worklist))
+	pool->nr_running--;
+	if (need_more_worker(pool))
 		wake_up_worker(pool);
 	raw_spin_unlock_irq(&pool->lock);
 }
@@ -973,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
 	/* If transitioning into NOT_RUNNING, adjust nr_running. */
 	if ((flags & WORKER_NOT_RUNNING) &&
 	    !(worker->flags & WORKER_NOT_RUNNING)) {
-		atomic_dec(&pool->nr_running);
+		pool->nr_running--;
 	}
 
 	worker->flags |= flags;
@@ -1005,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 	 */
 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 		if (!(worker->flags & WORKER_NOT_RUNNING))
-			atomic_inc(&pool->nr_running);
+			pool->nr_running++;
 }
 
 /**
@@ -1806,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker)
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
 
 	/* Sanity check nr_running. */
-	WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
-		     atomic_read(&pool->nr_running));
+	WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
 }
 
 /**
@@ -4985,7 +4988,7 @@ static void unbind_workers(int cpu)
 		 * an unbound (in terms of concurrency management) pool which
 		 * are served by workers tied to the pool.
 		 */
-		atomic_set(&pool->nr_running, 0);
+		pool->nr_running = 0;
 
 		/*
 		 * With concurrency management just turned off, a busy
-- 
cgit v1.2.3


From 4148be7de0a3316edd1af45609d354cac0e6a021 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Sat, 8 Jan 2022 00:38:16 +0000
Subject: cgroup: rstat: use same convention to assign cgroup_base_stat

In function cgroup_base_stat_flush(), we update cgroup_base_stat by
getting rstatc->bstat and adjust delta to related fields.

There are two convention to assign cgroup_base_stat in this function:

  * rstat2 = rstat1
  * rstat2.cputime = rstat1.cputime

The second convention may make audience think just field "cputime" is
updated, while cputime is the only field in cgroup_base_stat.

Let's use the same convention to eliminate this confusion.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/rstat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..0b32fa62e93c 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -325,7 +325,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 	/* fetch the current per-cpu values */
 	do {
 		seq = __u64_stats_fetch_begin(&rstatc->bsync);
-		cur.cputime = rstatc->bstat.cputime;
+		cur = rstatc->bstat;
 	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
 
 	/* propagate percpu delta to global */
-- 
cgit v1.2.3


From 95b99f353cf3d9d753796975f2093b13f75e0fbd Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Sat, 8 Jan 2022 00:38:17 +0000
Subject: cgroup: rstat: retrieve current bstat to delta directly

Instead of retrieve current bstat to cur and copy it to delta, let's use
delta directly.

This saves one copy operation and has the same code convention as
propagating delta to parent.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/rstat.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 0b32fa62e93c..29ea74f0eab3 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -315,7 +315,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 {
 	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 	struct cgroup *parent = cgroup_parent(cgrp);
-	struct cgroup_base_stat cur, delta;
+	struct cgroup_base_stat delta;
 	unsigned seq;
 
 	/* Root-level stats are sourced from system-wide CPU stats */
@@ -325,11 +325,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 	/* fetch the current per-cpu values */
 	do {
 		seq = __u64_stats_fetch_begin(&rstatc->bsync);
-		cur = rstatc->bstat;
+		delta = rstatc->bstat;
 	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
 
 	/* propagate percpu delta to global */
-	delta = cur;
 	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
 	cgroup_base_stat_add(&cgrp->bstat, &delta);
 	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
-- 
cgit v1.2.3


From ffacbd11e2580a93546733d2e4e320c181f76844 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Sat, 8 Jan 2022 14:38:12 +0800
Subject: cgroup: Fix cgroup_can_fork() and cgroup_post_fork() kernel-doc
 comment

Add the description of @kargs in cgroup_can_fork() and
cgroup_post_fork() kernel-doc comment to remove warnings found
by running scripts/kernel-doc, which is caused by using 'make W=1'.

kernel/cgroup/cgroup.c:6235: warning: Function parameter or member
'kargs' not described in 'cgroup_can_fork'
kernel/cgroup/cgroup.c:6296: warning: Function parameter or member
'kargs' not described in 'cgroup_post_fork'

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b31e1465868a..37c49e1a672f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6224,6 +6224,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
 /**
  * cgroup_can_fork - called on a new task before the process is exposed
  * @child: the child process
+ * @kargs: the arguments passed to create the child process
  *
  * This prepares a new css_set for the child process which the child will
  * be attached to in cgroup_post_fork().
@@ -6286,6 +6287,7 @@ void cgroup_cancel_fork(struct task_struct *child,
 /**
  * cgroup_post_fork - finalize cgroup setup for the child process
  * @child: the child process
+ * @kargs: the arguments passed to create the child process
  *
  * Attach the child process to its css_set calling the subsystem fork()
  * callbacks.
-- 
cgit v1.2.3


From 18688de203b47e5d8d9d0953385bf30b5949324f Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 14 Jan 2022 22:09:44 +0530
Subject: bpf: Fix UAF due to race between btf_try_get_module and load_module

While working on code to populate kfunc BTF ID sets for module BTF from
its initcall, I noticed that by the time the initcall is invoked, the
module BTF can already be seen by userspace (and the BPF verifier). The
existing btf_try_get_module calls try_module_get which only fails if
mod->state == MODULE_STATE_GOING, i.e. it can increment module reference
when module initcall is happening in parallel.

Currently, BTF parsing happens from MODULE_STATE_COMING notifier
callback. At this point, the module initcalls have not been invoked.
The notifier callback parses and prepares the module BTF, allocates an
ID, which publishes it to userspace, and then adds it to the btf_modules
list allowing the kernel to invoke btf_try_get_module for the BTF.

However, at this point, the module has not been fully initialized (i.e.
its initcalls have not finished). The code in module.c can still fail
and free the module, without caring for other users. However, nothing
stops btf_try_get_module from succeeding between the state transition
from MODULE_STATE_COMING to MODULE_STATE_LIVE.

This leads to a use-after-free issue when BPF program loads
successfully in the state transition, load_module's do_init_module call
fails and frees the module, and BPF program fd on close calls module_put
for the freed module. Future patch has test case to verify we don't
regress in this area in future.

There are multiple points after prepare_coming_module (in load_module)
where failure can occur and module loading can return error. We
illustrate and test for the race using the last point where it can
practically occur (in module __init function).

An illustration of the race:

CPU 0                           CPU 1
			  load_module
			    notifier_call(MODULE_STATE_COMING)
			      btf_parse_module
			      btf_alloc_id	// Published to userspace
			      list_add(&btf_mod->list, btf_modules)
			    mod->init(...)
...				^
bpf_check		        |
check_pseudo_btf_id             |
  btf_try_get_module            |
    returns true                |  ...
...                             |  module __init in progress
return prog_fd                  |  ...
...                             V
			    if (ret < 0)
			      free_module(mod)
			    ...
close(prog_fd)
 ...
 bpf_prog_free_deferred
  module_put(used_btf.mod) // use-after-free

We fix this issue by setting a flag BTF_MODULE_F_LIVE, from the notifier
callback when MODULE_STATE_LIVE state is reached for the module, so that
we return NULL from btf_try_get_module for modules that are not fully
formed. Since try_module_get already checks that module is not in
MODULE_STATE_GOING state, and that is the only transition a live module
can make before being removed from btf_modules list, this is enough to
close the race and prevent the bug.

A later selftest patch crafts the race condition artifically to verify
that it has been fixed, and that verifier fails to load program (with
ENXIO).

Lastly, a couple of comments:

 1. Even if this race didn't exist, it seems more appropriate to only
    access resources (ksyms and kfuncs) of a fully formed module which
    has been initialized completely.

 2. This patch was born out of need for synchronization against module
    initcall for the next patch, so it is needed for correctness even
    without the aforementioned race condition. The BTF resources
    initialized by module initcall are set up once and then only looked
    up, so just waiting until the initcall has finished ensures correct
    behavior.

Fixes: 541c3bad8dc5 ("bpf: Support BPF ksym variables in kernel modules")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 33bb8ae4a804..f25bca59909d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6200,12 +6200,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
 	return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
 }
 
+enum {
+	BTF_MODULE_F_LIVE = (1 << 0),
+};
+
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 struct btf_module {
 	struct list_head list;
 	struct module *module;
 	struct btf *btf;
 	struct bin_attribute *sysfs_attr;
+	int flags;
 };
 
 static LIST_HEAD(btf_modules);
@@ -6233,7 +6238,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
 	int err = 0;
 
 	if (mod->btf_data_size == 0 ||
-	    (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+	    (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+	     op != MODULE_STATE_GOING))
 		goto out;
 
 	switch (op) {
@@ -6291,6 +6297,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
 			btf_mod->sysfs_attr = attr;
 		}
 
+		break;
+	case MODULE_STATE_LIVE:
+		mutex_lock(&btf_module_mutex);
+		list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+			if (btf_mod->module != module)
+				continue;
+
+			btf_mod->flags |= BTF_MODULE_F_LIVE;
+			break;
+		}
+		mutex_unlock(&btf_module_mutex);
 		break;
 	case MODULE_STATE_GOING:
 		mutex_lock(&btf_module_mutex);
@@ -6338,7 +6355,12 @@ struct module *btf_try_get_module(const struct btf *btf)
 		if (btf_mod->btf != btf)
 			continue;
 
-		if (try_module_get(btf_mod->module))
+		/* We must only consider module whose __init routine has
+		 * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+		 * which is set from the notifier callback for
+		 * MODULE_STATE_LIVE.
+		 */
+		if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
 			res = btf_mod->module;
 
 		break;
-- 
cgit v1.2.3


From dee872e124e8d5de22b68c58f6f6c3f5e8889160 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 14 Jan 2022 22:09:45 +0530
Subject: bpf: Populate kfunc BTF ID sets in struct btf

This patch prepares the kernel to support putting all kinds of kfunc BTF
ID sets in the struct btf itself. The various kernel subsystems will
make register_btf_kfunc_id_set call in the initcalls (for built-in code
and modules).

The 'hook' is one of the many program types, e.g. XDP and TC/SCHED_CLS,
STRUCT_OPS, and 'types' are check (allowed or not), acquire, release,
and ret_null (with PTR_TO_BTF_ID_OR_NULL return type).

A maximum of BTF_KFUNC_SET_MAX_CNT (32) kfunc BTF IDs are permitted in a
set of certain hook and type for vmlinux sets, since they are allocated
on demand, and otherwise set as NULL. Module sets can only be registered
once per hook and type, hence they are directly assigned.

A new btf_kfunc_id_set_contains function is exposed for use in verifier,
this new method is faster than the existing list searching method, and
is also automatic. It also lets other code not care whether the set is
unallocated or not.

Note that module code can only do single register_btf_kfunc_id_set call
per hook. This is why sorting is only done for in-kernel vmlinux sets,
because there might be multiple sets for the same hook and type that
must be concatenated, hence sorting them is required to ensure bsearch
in btf_id_set_contains continues to work correctly.

Next commit will update the kernel users to make use of this
infrastructure.

Finally, add __maybe_unused annotation for BTF ID macros for the
!CONFIG_DEBUG_INFO_BTF case, so that they don't produce warnings during
build time.

The previous patch is also needed to provide synchronization against
initialization for module BTF's kfunc_set_tab introduced here, as
described below:

  The kfunc_set_tab pointer in struct btf is write-once (if we consider
  the registration phase (comprised of multiple register_btf_kfunc_id_set
  calls) as a single operation). In this sense, once it has been fully
  prepared, it isn't modified, only used for lookup (from the verifier
  context).

  For btf_vmlinux, it is initialized fully during the do_initcalls phase,
  which happens fairly early in the boot process, before any processes are
  present. This also eliminates the possibility of bpf_check being called
  at that point, thus relieving us of ensuring any synchronization between
  the registration and lookup function (btf_kfunc_id_set_contains).

  However, the case for module BTF is a bit tricky. The BTF is parsed,
  prepared, and published from the MODULE_STATE_COMING notifier callback.
  After this, the module initcalls are invoked, where our registration
  function will be called to populate the kfunc_set_tab for module BTF.

  At this point, BTF may be available to userspace while its corresponding
  module is still intializing. A BTF fd can then be passed to verifier
  using bpf syscall (e.g. for kfunc call insn).

  Hence, there is a race window where verifier may concurrently try to
  lookup the kfunc_set_tab. To prevent this race, we must ensure the
  operations are serialized, or waiting for the __init functions to
  complete.

  In the earlier registration API, this race was alleviated as verifier
  bpf_check_mod_kfunc_call didn't find the kfunc BTF ID until it was added
  by the registration function (called usually at the end of module __init
  function after all module resources have been initialized). If the
  verifier made the check_kfunc_call before kfunc BTF ID was added to the
  list, it would fail verification (saying call isn't allowed). The
  access to list was protected using a mutex.

  Now, it would still fail verification, but for a different reason
  (returning ENXIO due to the failed btf_try_get_module call in
  add_kfunc_call), because if the __init call is in progress the module
  will be in the middle of MODULE_STATE_COMING -> MODULE_STATE_LIVE
  transition, and the BTF_MODULE_LIVE flag for btf_module instance will
  not be set, so the btf_try_get_module call will fail.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 244 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f25bca59909d..74037bd65d17 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -198,6 +198,21 @@
 DEFINE_IDR(btf_idr);
 DEFINE_SPINLOCK(btf_idr_lock);
 
+enum btf_kfunc_hook {
+	BTF_KFUNC_HOOK_XDP,
+	BTF_KFUNC_HOOK_TC,
+	BTF_KFUNC_HOOK_STRUCT_OPS,
+	BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+	BTF_KFUNC_SET_MAX_CNT = 32,
+};
+
+struct btf_kfunc_set_tab {
+	struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+};
+
 struct btf {
 	void *data;
 	struct btf_type **types;
@@ -212,6 +227,7 @@ struct btf {
 	refcount_t refcnt;
 	u32 id;
 	struct rcu_head rcu;
+	struct btf_kfunc_set_tab *kfunc_set_tab;
 
 	/* split BTF support */
 	struct btf *base_btf;
@@ -1531,8 +1547,30 @@ static void btf_free_id(struct btf *btf)
 	spin_unlock_irqrestore(&btf_idr_lock, flags);
 }
 
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+	struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+	int hook, type;
+
+	if (!tab)
+		return;
+	/* For module BTF, we directly assign the sets being registered, so
+	 * there is nothing to free except kfunc_set_tab.
+	 */
+	if (btf_is_module(btf))
+		goto free_tab;
+	for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
+		for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
+			kfree(tab->sets[hook][type]);
+	}
+free_tab:
+	kfree(tab);
+	btf->kfunc_set_tab = NULL;
+}
+
 static void btf_free(struct btf *btf)
 {
+	btf_free_kfunc_set_tab(btf);
 	kvfree(btf->types);
 	kvfree(btf->resolved_sizes);
 	kvfree(btf->resolved_ids);
@@ -6371,6 +6409,36 @@ struct module *btf_try_get_module(const struct btf *btf)
 	return res;
 }
 
+/* Returns struct btf corresponding to the struct module
+ *
+ * This function can return NULL or ERR_PTR. Note that caller must
+ * release reference for struct btf iff btf_is_module is true.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+	struct btf *btf = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	struct btf_module *btf_mod, *tmp;
+#endif
+
+	if (!module)
+		return bpf_get_btf_vmlinux();
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	mutex_lock(&btf_module_mutex);
+	list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+		if (btf_mod->module != module)
+			continue;
+
+		btf_get(btf_mod->btf);
+		btf = btf_mod->btf;
+		break;
+	}
+	mutex_unlock(&btf_module_mutex);
+#endif
+
+	return btf;
+}
+
 BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
 {
 	struct btf *btf;
@@ -6438,7 +6506,181 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
 BTF_TRACING_TYPE_xxx
 #undef BTF_TRACING_TYPE
 
-/* BTF ID set registration API for modules */
+/* Kernel Function (kfunc) BTF ID set registration API */
+
+static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+				    enum btf_kfunc_type type,
+				    struct btf_id_set *add_set, bool vmlinux_set)
+{
+	struct btf_kfunc_set_tab *tab;
+	struct btf_id_set *set;
+	u32 set_cnt;
+	int ret;
+
+	if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+		ret = -EINVAL;
+		goto end;
+	}
+
+	if (!add_set->cnt)
+		return 0;
+
+	tab = btf->kfunc_set_tab;
+	if (!tab) {
+		tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+		if (!tab)
+			return -ENOMEM;
+		btf->kfunc_set_tab = tab;
+	}
+
+	set = tab->sets[hook][type];
+	/* Warn when register_btf_kfunc_id_set is called twice for the same hook
+	 * for module sets.
+	 */
+	if (WARN_ON_ONCE(set && !vmlinux_set)) {
+		ret = -EINVAL;
+		goto end;
+	}
+
+	/* We don't need to allocate, concatenate, and sort module sets, because
+	 * only one is allowed per hook. Hence, we can directly assign the
+	 * pointer and return.
+	 */
+	if (!vmlinux_set) {
+		tab->sets[hook][type] = add_set;
+		return 0;
+	}
+
+	/* In case of vmlinux sets, there may be more than one set being
+	 * registered per hook. To create a unified set, we allocate a new set
+	 * and concatenate all individual sets being registered. While each set
+	 * is individually sorted, they may become unsorted when concatenated,
+	 * hence re-sorting the final set again is required to make binary
+	 * searching the set using btf_id_set_contains function work.
+	 */
+	set_cnt = set ? set->cnt : 0;
+
+	if (set_cnt > U32_MAX - add_set->cnt) {
+		ret = -EOVERFLOW;
+		goto end;
+	}
+
+	if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+		ret = -E2BIG;
+		goto end;
+	}
+
+	/* Grow set */
+	set = krealloc(tab->sets[hook][type],
+		       offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+		       GFP_KERNEL | __GFP_NOWARN);
+	if (!set) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	/* For newly allocated set, initialize set->cnt to 0 */
+	if (!tab->sets[hook][type])
+		set->cnt = 0;
+	tab->sets[hook][type] = set;
+
+	/* Concatenate the two sets */
+	memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+	set->cnt += add_set->cnt;
+
+	sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+
+	return 0;
+end:
+	btf_free_kfunc_set_tab(btf);
+	return ret;
+}
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+				  const struct btf_kfunc_id_set *kset)
+{
+	bool vmlinux_set = !btf_is_module(btf);
+	int type, ret;
+
+	for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
+		if (!kset->sets[type])
+			continue;
+
+		ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+					enum btf_kfunc_hook hook,
+					enum btf_kfunc_type type,
+					u32 kfunc_btf_id)
+{
+	struct btf_id_set *set;
+
+	if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
+		return false;
+	if (!btf->kfunc_set_tab)
+		return false;
+	set = btf->kfunc_set_tab->sets[hook][type];
+	if (!set)
+		return false;
+	return btf_id_set_contains(set, kfunc_btf_id);
+}
+
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+{
+	switch (prog_type) {
+	case BPF_PROG_TYPE_XDP:
+		return BTF_KFUNC_HOOK_XDP;
+	case BPF_PROG_TYPE_SCHED_CLS:
+		return BTF_KFUNC_HOOK_TC;
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		return BTF_KFUNC_HOOK_STRUCT_OPS;
+	default:
+		return BTF_KFUNC_HOOK_MAX;
+	}
+}
+
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+bool btf_kfunc_id_set_contains(const struct btf *btf,
+			       enum bpf_prog_type prog_type,
+			       enum btf_kfunc_type type, u32 kfunc_btf_id)
+{
+	enum btf_kfunc_hook hook;
+
+	hook = bpf_prog_type_to_kfunc_hook(prog_type);
+	return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+			      const struct btf_kfunc_id_set *kset)
+{
+	enum btf_kfunc_hook hook;
+	struct btf *btf;
+	int ret;
+
+	btf = btf_get_module_btf(kset->owner);
+	if (IS_ERR_OR_NULL(btf))
+		return btf ? PTR_ERR(btf) : -ENOENT;
+
+	hook = bpf_prog_type_to_kfunc_hook(prog_type);
+	ret = btf_populate_kfunc_set(btf, hook, kset);
+	/* reference is only taken for module BTF */
+	if (btf_is_module(btf))
+		btf_put(btf);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
 
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 
-- 
cgit v1.2.3


From b202d84422223b7222cba5031d182f20b37e146e Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 14 Jan 2022 22:09:46 +0530
Subject: bpf: Remove check_kfunc_call callback and old kfunc BTF ID API

Completely remove the old code for check_kfunc_call to help it work
with modules, and also the callback itself.

The previous commit adds infrastructure to register all sets and put
them in vmlinux or module BTF, and concatenates all related sets
organized by the hook and the type. Once populated, these sets remain
immutable for the lifetime of the struct btf.

Also, since we don't need the 'owner' module anywhere when doing
check_kfunc_call, drop the 'btf_modp' module parameter from
find_kfunc_desc_btf.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      | 46 ----------------------------------------------
 kernel/bpf/verifier.c | 20 ++++++++------------
 2 files changed, 8 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 74037bd65d17..4be5cf629ca9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6682,52 +6682,6 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 }
 EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
 
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-			       struct kfunc_btf_id_set *s)
-{
-	mutex_lock(&l->mutex);
-	list_add(&s->list, &l->list);
-	mutex_unlock(&l->mutex);
-}
-EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
-
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
-				 struct kfunc_btf_id_set *s)
-{
-	mutex_lock(&l->mutex);
-	list_del_init(&s->list);
-	mutex_unlock(&l->mutex);
-}
-EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
-
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
-			      struct module *owner)
-{
-	struct kfunc_btf_id_set *s;
-
-	mutex_lock(&klist->mutex);
-	list_for_each_entry(s, &klist->list, list) {
-		if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
-			mutex_unlock(&klist->mutex);
-			return true;
-		}
-	}
-	mutex_unlock(&klist->mutex);
-	return false;
-}
-
-#define DEFINE_KFUNC_BTF_ID_LIST(name)                                         \
-	struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list),           \
-					  __MUTEX_INITIALIZER(name.mutex) };   \
-	EXPORT_SYMBOL_GPL(name)
-
-DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
-DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
-
-#endif
-
 int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
 			      const struct btf *targ_btf, __u32 targ_id)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bfb45381fb3f..72802c1eb5ac 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1741,7 +1741,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
 }
 
 static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
-					 s16 offset, struct module **btf_modp)
+					 s16 offset)
 {
 	struct bpf_kfunc_btf kf_btf = { .offset = offset };
 	struct bpf_kfunc_btf_tab *tab;
@@ -1795,8 +1795,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
 		sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
 		     kfunc_btf_cmp_by_off, NULL);
 	}
-	if (btf_modp)
-		*btf_modp = b->module;
 	return b->btf;
 }
 
@@ -1813,8 +1811,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
 }
 
 static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
-				       u32 func_id, s16 offset,
-				       struct module **btf_modp)
+				       u32 func_id, s16 offset)
 {
 	if (offset) {
 		if (offset < 0) {
@@ -1825,7 +1822,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
 			return ERR_PTR(-EINVAL);
 		}
 
-		return __find_kfunc_desc_btf(env, offset, btf_modp);
+		return __find_kfunc_desc_btf(env, offset);
 	}
 	return btf_vmlinux ?: ERR_PTR(-ENOENT);
 }
@@ -1888,7 +1885,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 		prog_aux->kfunc_btf_tab = btf_tab;
 	}
 
-	desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+	desc_btf = find_kfunc_desc_btf(env, func_id, offset);
 	if (IS_ERR(desc_btf)) {
 		verbose(env, "failed to find BTF for kernel function\n");
 		return PTR_ERR(desc_btf);
@@ -2349,7 +2346,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
 	if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
 		return NULL;
 
-	desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+	desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
 	if (IS_ERR(desc_btf))
 		return "<error>";
 
@@ -6820,7 +6817,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	struct bpf_reg_state *regs = cur_regs(env);
 	const char *func_name, *ptr_type_name;
 	u32 i, nargs, func_id, ptr_type_id;
-	struct module *btf_mod = NULL;
 	const struct btf_param *args;
 	struct btf *desc_btf;
 	int err;
@@ -6829,7 +6825,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	if (!insn->imm)
 		return 0;
 
-	desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+	desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
 	if (IS_ERR(desc_btf))
 		return PTR_ERR(desc_btf);
 
@@ -6838,8 +6834,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	func_name = btf_name_by_offset(desc_btf, func->name_off);
 	func_proto = btf_type_by_id(desc_btf, func->type);
 
-	if (!env->ops->check_kfunc_call ||
-	    !env->ops->check_kfunc_call(func_id, btf_mod)) {
+	if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+				      BTF_KFUNC_TYPE_CHECK, func_id)) {
 		verbose(env, "calling kernel function %s is not allowed\n",
 			func_name);
 		return -EACCES;
-- 
cgit v1.2.3


From d583691c47dc0424ebe926000339a6d6cd590ff7 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 14 Jan 2022 22:09:47 +0530
Subject: bpf: Introduce mem, size argument pair support for kfunc

BPF helpers can associate two adjacent arguments together to pass memory
of certain size, using ARG_PTR_TO_MEM and ARG_CONST_SIZE arguments.
Since we don't use bpf_func_proto for kfunc, we need to leverage BTF to
implement similar support.

The ARG_CONST_SIZE processing for helpers is refactored into a common
check_mem_size_reg helper that is shared with kfunc as well. kfunc
ptr_to_mem support follows logic similar to global functions, where
verification is done as if pointer is not null, even when it may be
null.

This leads to a simple to follow rule for writing kfunc: always check
the argument pointer for NULL, except when it is PTR_TO_CTX. Also, the
PTR_TO_CTX case is also only safe when the helper expecting pointer to
program ctx is not exposed to other programs where same struct is not
ctx type. In that case, the type check will fall through to other cases
and would permit passing other types of pointers, possibly NULL at
runtime.

Currently, we require the size argument to be suffixed with "__sz" in
the parameter name. This information is then recorded in kernel BTF and
verified during function argument checking. In the future we can use BTF
tagging instead, and modify the kernel function definitions. This will
be a purely kernel-side change.

This allows us to have some form of backwards compatibility for
structures that are passed in to the kernel function with their size,
and allow variable length structures to be passed in if they are
accompanied by a size parameter.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      |  48 +++++++++++++++++--
 kernel/bpf/verifier.c | 124 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 124 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4be5cf629ca9..cf46694cb266 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5654,6 +5654,32 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
 	return true;
 }
 
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+				  const struct btf_param *arg,
+				  const struct bpf_reg_state *reg)
+{
+	int len, sfx_len = sizeof("__sz") - 1;
+	const struct btf_type *t;
+	const char *param_name;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	/* In the future, this can be ported to use BTF tagging */
+	param_name = btf_name_by_offset(btf, arg->name_off);
+	if (str_is_empty(param_name))
+		return false;
+	len = strlen(param_name);
+	if (len < sfx_len)
+		return false;
+	param_name += len - sfx_len;
+	if (strncmp(param_name, "__sz", sfx_len))
+		return false;
+
+	return true;
+}
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
@@ -5765,17 +5791,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			u32 type_size;
 
 			if (is_kfunc) {
+				bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+
 				/* Permit pointer to mem, but only when argument
 				 * type is pointer to scalar, or struct composed
 				 * (recursively) of scalars.
+				 * When arg_mem_size is true, the pointer can be
+				 * void *.
 				 */
 				if (!btf_type_is_scalar(ref_t) &&
-				    !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+				    !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+				    (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
 					bpf_log(log,
-						"arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
-						i, btf_type_str(ref_t), ref_tname);
+						"arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+						i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
 					return -EINVAL;
 				}
+
+				/* Check for mem, len pair */
+				if (arg_mem_size) {
+					if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
+						bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
+							i, i + 1);
+						return -EINVAL;
+					}
+					i++;
+					continue;
+				}
 			}
 
 			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 72802c1eb5ac..2b186185b6b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4864,6 +4864,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *reg, u32 regno,
+			      bool zero_size_allowed,
+			      struct bpf_call_arg_meta *meta)
+{
+	int err;
+
+	/* This is used to refine r0 return value bounds for helpers
+	 * that enforce this value as an upper bound on return values.
+	 * See do_refine_retval_range() for helpers that can refine
+	 * the return value. C type of helper is u32 so we pull register
+	 * bound from umax_value however, if negative verifier errors
+	 * out. Only upper bounds can be learned because retval is an
+	 * int type and negative retvals are allowed.
+	 */
+	if (meta)
+		meta->msize_max_value = reg->umax_value;
+
+	/* The register is SCALAR_VALUE; the access check
+	 * happens using its boundaries.
+	 */
+	if (!tnum_is_const(reg->var_off))
+		/* For unprivileged variable accesses, disable raw
+		 * mode so that the program is required to
+		 * initialize all the memory that the helper could
+		 * just partially fill up.
+		 */
+		meta = NULL;
+
+	if (reg->smin_value < 0) {
+		verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+			regno);
+		return -EACCES;
+	}
+
+	if (reg->umin_value == 0) {
+		err = check_helper_mem_access(env, regno - 1, 0,
+					      zero_size_allowed,
+					      meta);
+		if (err)
+			return err;
+	}
+
+	if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+		verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+			regno);
+		return -EACCES;
+	}
+	err = check_helper_mem_access(env, regno - 1,
+				      reg->umax_value,
+				      zero_size_allowed, meta);
+	if (!err)
+		err = mark_chain_precision(env, regno);
+	return err;
+}
+
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size)
 {
@@ -4887,6 +4943,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 	return check_helper_mem_access(env, regno, mem_size, true, NULL);
 }
 
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			     u32 regno)
+{
+	struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+	bool may_be_null = type_may_be_null(mem_reg->type);
+	struct bpf_reg_state saved_reg;
+	int err;
+
+	WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+	if (may_be_null) {
+		saved_reg = *mem_reg;
+		mark_ptr_not_null_reg(mem_reg);
+	}
+
+	err = check_mem_size_reg(env, reg, regno, true, NULL);
+
+	if (may_be_null)
+		*mem_reg = saved_reg;
+	return err;
+}
+
 /* Implementation details:
  * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
  * Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -5408,51 +5486,7 @@ skip_type_check:
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-		/* This is used to refine r0 return value bounds for helpers
-		 * that enforce this value as an upper bound on return values.
-		 * See do_refine_retval_range() for helpers that can refine
-		 * the return value. C type of helper is u32 so we pull register
-		 * bound from umax_value however, if negative verifier errors
-		 * out. Only upper bounds can be learned because retval is an
-		 * int type and negative retvals are allowed.
-		 */
-		meta->msize_max_value = reg->umax_value;
-
-		/* The register is SCALAR_VALUE; the access check
-		 * happens using its boundaries.
-		 */
-		if (!tnum_is_const(reg->var_off))
-			/* For unprivileged variable accesses, disable raw
-			 * mode so that the program is required to
-			 * initialize all the memory that the helper could
-			 * just partially fill up.
-			 */
-			meta = NULL;
-
-		if (reg->smin_value < 0) {
-			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
-				regno);
-			return -EACCES;
-		}
-
-		if (reg->umin_value == 0) {
-			err = check_helper_mem_access(env, regno - 1, 0,
-						      zero_size_allowed,
-						      meta);
-			if (err)
-				return err;
-		}
-
-		if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
-				regno);
-			return -EACCES;
-		}
-		err = check_helper_mem_access(env, regno - 1,
-					      reg->umax_value,
-					      zero_size_allowed, meta);
-		if (!err)
-			err = mark_chain_precision(env, regno);
+		err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
 	} else if (arg_type_is_alloc_size(arg_type)) {
 		if (!tnum_is_const(reg->var_off)) {
 			verbose(env, "R%d is not a known constant'\n",
-- 
cgit v1.2.3


From 5c073f26f9dc78a6c8194b23eac7537c9692c7d7 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 14 Jan 2022 22:09:48 +0530
Subject: bpf: Add reference tracking support to kfunc

This patch adds verifier support for PTR_TO_BTF_ID return type of kfunc
to be a reference, by reusing acquire_reference_state/release_reference
support for existing in-kernel bpf helpers.

We make use of the three kfunc types:

- BTF_KFUNC_TYPE_ACQUIRE
  Return true if kfunc_btf_id is an acquire kfunc.  This will
  acquire_reference_state for the returned PTR_TO_BTF_ID (this is the
  only allow return value). Note that acquire kfunc must always return a
  PTR_TO_BTF_ID{_OR_NULL}, otherwise the program is rejected.

- BTF_KFUNC_TYPE_RELEASE
  Return true if kfunc_btf_id is a release kfunc.  This will release the
  reference to the passed in PTR_TO_BTF_ID which has a reference state
  (from earlier acquire kfunc).
  The btf_check_func_arg_match returns the regno (of argument register,
  hence > 0) if the kfunc is a release kfunc, and a proper referenced
  PTR_TO_BTF_ID is being passed to it.
  This is similar to how helper call check uses bpf_call_arg_meta to
  store the ref_obj_id that is later used to release the reference.
  Similar to in-kernel helper, we only allow passing one referenced
  PTR_TO_BTF_ID as an argument. It can also be passed in to normal
  kfunc, but in case of release kfunc there must always be one
  PTR_TO_BTF_ID argument that is referenced.

- BTF_KFUNC_TYPE_RET_NULL
  For kfunc returning PTR_TO_BTF_ID, tells if it can be NULL, hence
  force caller to mark the pointer not null (using check) before
  accessing it. Note that taking into account the case fixed by commit
  93c230e3f5bd ("bpf: Enforce id generation for all may-be-null register type")
  we assign a non-zero id for mark_ptr_or_null_reg logic. Later, if more
  return types are supported by kfunc, which have a _OR_NULL variant, it
  might be better to move this id generation under a common
  reg_type_may_be_null check, similar to the case in the commit.

Referenced PTR_TO_BTF_ID is currently only limited to kfunc, but can be
extended in the future to other BPF helpers as well.  For now, we can
rely on the btf_struct_ids_match check to ensure we get the pointer to
the expected struct type. In the future, care needs to be taken to avoid
ambiguity for reference PTR_TO_BTF_ID passed to release function, in
case multiple candidates can release same BTF ID.

e.g. there might be two release kfuncs (or kfunc and helper):

foo(struct abc *p);
bar(struct abc *p);

... such that both release a PTR_TO_BTF_ID with btf_id of struct abc. In
this case we would need to track the acquire function corresponding to
the release function to avoid type confusion, and store this information
in the register state so that an incorrect program can be rejected. This
is not a problem right now, hence it is left as an exercise for the
future patch introducing such a case in the kernel.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220114163953.1455836-6-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      | 32 +++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 72 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cf46694cb266..57f5fd5af2f9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5686,11 +5686,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    bool ptr_to_mem_ok)
 {
 	struct bpf_verifier_log *log = &env->log;
+	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
-	u32 i, nargs, ref_id;
+	int ref_regno = 0;
+	bool rel = false;
 
 	t = btf_type_by_id(btf, func_id);
 	if (!t || !btf_type_is_func(t)) {
@@ -5768,6 +5770,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			if (reg->type == PTR_TO_BTF_ID) {
 				reg_btf = reg->btf;
 				reg_ref_id = reg->btf_id;
+				/* Ensure only one argument is referenced PTR_TO_BTF_ID */
+				if (reg->ref_obj_id) {
+					if (ref_obj_id) {
+						bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+							regno, reg->ref_obj_id, ref_obj_id);
+						return -EFAULT;
+					}
+					ref_regno = regno;
+					ref_obj_id = reg->ref_obj_id;
+				}
 			} else {
 				reg_btf = btf_vmlinux;
 				reg_ref_id = *reg2btf_ids[reg->type];
@@ -5838,7 +5850,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		}
 	}
 
-	return 0;
+	/* Either both are set, or neither */
+	WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
+	if (is_kfunc) {
+		rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+						BTF_KFUNC_TYPE_RELEASE, func_id);
+		/* We already made sure ref_obj_id is set only for one argument */
+		if (rel && !ref_obj_id) {
+			bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+				func_name);
+			return -EINVAL;
+		}
+		/* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to
+		 * other kfuncs works
+		 */
+	}
+	/* returns argument register number > 0 in case of reference release kfunc */
+	return rel ? ref_regno : 0;
 }
 
 /* Compare BTF of a function with given bpf_reg_state.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2b186185b6b2..8c5a46d41f28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
 {
 	return base_type(type) == PTR_TO_SOCKET ||
 		base_type(type) == PTR_TO_TCP_SOCK ||
-		base_type(type) == PTR_TO_MEM;
+		base_type(type) == PTR_TO_MEM ||
+		base_type(type) == PTR_TO_BTF_ID;
 }
 
 static bool type_is_rdonly_mem(u32 type)
@@ -3493,11 +3494,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 
 #define MAX_PACKET_OFF 0xffff
 
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
-	return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 				       const struct bpf_call_arg_meta *meta,
 				       enum bpf_access_type t)
@@ -6845,15 +6841,17 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
 	}
 }
 
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			    int *insn_idx_p)
 {
 	const struct btf_type *t, *func, *func_proto, *ptr_type;
 	struct bpf_reg_state *regs = cur_regs(env);
 	const char *func_name, *ptr_type_name;
 	u32 i, nargs, func_id, ptr_type_id;
+	int err, insn_idx = *insn_idx_p;
 	const struct btf_param *args;
 	struct btf *desc_btf;
-	int err;
+	bool acq;
 
 	/* skip for now, but return error when we find this in fixup_kfunc_call */
 	if (!insn->imm)
@@ -6875,16 +6873,36 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EACCES;
 	}
 
+	acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+					BTF_KFUNC_TYPE_ACQUIRE, func_id);
+
 	/* Check the arguments */
 	err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
-	if (err)
+	if (err < 0)
 		return err;
+	/* In case of release function, we get register number of refcounted
+	 * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+	 */
+	if (err) {
+		err = release_reference(env, regs[err].ref_obj_id);
+		if (err) {
+			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+				func_name, func_id);
+			return err;
+		}
+	}
 
 	for (i = 0; i < CALLER_SAVED_REGS; i++)
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
 	/* Check return type */
 	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+
+	if (acq && !btf_type_is_ptr(t)) {
+		verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+		return -EINVAL;
+	}
+
 	if (btf_type_is_scalar(t)) {
 		mark_reg_unknown(env, regs, BPF_REG_0);
 		mark_btf_func_reg_size(env, BPF_REG_0, t->size);
@@ -6903,7 +6921,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		regs[BPF_REG_0].btf = desc_btf;
 		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
 		regs[BPF_REG_0].btf_id = ptr_type_id;
+		if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+					      BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+			regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+			/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+			regs[BPF_REG_0].id = ++env->id_gen;
+		}
 		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+		if (acq) {
+			int id = acquire_reference_state(env, insn_idx);
+
+			if (id < 0)
+				return id;
+			regs[BPF_REG_0].id = id;
+			regs[BPF_REG_0].ref_obj_id = id;
+		}
 	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
 
 	nargs = btf_type_vlen(func_proto);
@@ -11548,7 +11580,7 @@ static int do_check(struct bpf_verifier_env *env)
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
 				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
-					err = check_kfunc_call(env, insn);
+					err = check_kfunc_call(env, insn, &env->insn_idx);
 				else
 					err = check_helper_call(env, insn, &env->insn_idx);
 				if (err)
-- 
cgit v1.2.3


From f10d059661968b01ef61a8b516775f95a18ab8ae Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Thu, 16 Dec 2021 02:04:25 +0000
Subject: bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean

Right now BPF_PROG_RUN_ARRAY and related macros return 1 or 0
for whether the prog array allows or rejects whatever is being
hooked. The caller of these macros then return -EPERM or continue
processing based on thw macro's return value. Unforunately this is
inflexible, since -EPERM is the only err that can be returned.

This patch should be a no-op; it prepares for the next patch. The
returning of the -EPERM is moved to inside the macros, so the outer
functions are directly returning what the macros returned if they
are non-zero.

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/788abcdca55886d1f43274c918eaa9f792a9f33b.1639619851.git.zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 514b4681a90a..386155d279b3 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1080,7 +1080,6 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	} else {
 		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
 					    __bpf_prog_run_save_cb);
-		ret = (ret == 1 ? 0 : -EPERM);
 	}
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
@@ -1107,10 +1106,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
-	return ret == 1 ? 0 : -EPERM;
+	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
+				     bpf_prog_run);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -1142,7 +1140,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	};
 	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
-	int ret;
 
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
@@ -1156,10 +1153,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
-				          bpf_prog_run, flags);
-
-	return ret == 1 ? 0 : -EPERM;
+	return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+					   bpf_prog_run, flags);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
@@ -1184,11 +1179,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     enum cgroup_bpf_attach_type atype)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	int ret;
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
-				    bpf_prog_run);
-	return ret == 1 ? 0 : -EPERM;
+	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+				     bpf_prog_run);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
@@ -1201,15 +1194,15 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 		.major = major,
 		.minor = minor,
 	};
-	int allow;
+	int ret;
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-				      bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+				    bpf_prog_run);
 	rcu_read_unlock();
 
-	return !allow;
+	return ret;
 }
 
 static const struct bpf_func_proto *
@@ -1350,7 +1343,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 		kfree(ctx.new_val);
 	}
 
-	return ret == 1 ? 0 : -EPERM;
+	return ret;
 }
 
 #ifdef CONFIG_NET
@@ -1455,10 +1448,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 				    &ctx, bpf_prog_run);
 	release_sock(sk);
 
-	if (!ret) {
-		ret = -EPERM;
+	if (ret)
 		goto out;
-	}
 
 	if (ctx.optlen == -1) {
 		/* optlen set to -1, bypass kernel */
@@ -1565,10 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 				    &ctx, bpf_prog_run);
 	release_sock(sk);
 
-	if (!ret) {
-		ret = -EPERM;
+	if (ret)
 		goto out;
-	}
 
 	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
 		ret = -EFAULT;
@@ -1624,8 +1613,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 
 	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
 				    &ctx, bpf_prog_run);
-	if (!ret)
-		return -EPERM;
+	if (ret)
+		return ret;
 
 	if (ctx.optlen > *optlen)
 		return -EFAULT;
-- 
cgit v1.2.3


From c4dcfdd406aa2167396ac215e351e5e4dfd7efe3 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Thu, 16 Dec 2021 02:04:26 +0000
Subject: bpf: Move getsockopt retval to struct bpf_cg_run_ctx

The retval value is moved to struct bpf_cg_run_ctx for ease of access
in different prog types with different context structs layouts. The
helper implementation (to be added in a later patch in the series) can
simply perform a container_of from current->bpf_ctx to retrieve
bpf_cg_run_ctx.

Unfortunately, there is no easy way to access the current task_struct
via the verifier BPF bytecode rewrite, aside from possibly calling a
helper, so a pointer to current task is added to struct bpf_sockopt_kern
so that the rewritten BPF bytecode can access struct bpf_cg_run_ctx with
an indirection.

For backward compatibility, if a getsockopt program rejects a syscall
by returning 0, an -EPERM will be generated, by having the
BPF_PROG_RUN_ARRAY_CG family macros automatically set the retval to
-EPERM. Unlike prior to this patch, this -EPERM will be visible to
ctx->retval for any other hooks down the line in the prog array.

Additionally, the restriction that getsockopt filters can only set
the retval to 0 is removed, considering that certain getsockopt
implementations may return optlen. Filters are now able to set the
value arbitrarily.

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/73b0325f5c29912ccea7ea57ec1ed4d388fc1d37.1639619851.git.zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup.c | 82 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 386155d279b3..b6fad0bbf5a7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1079,7 +1079,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 			cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
 	} else {
 		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
-					    __bpf_prog_run_save_cb);
+					    __bpf_prog_run_save_cb, 0);
 	}
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
@@ -1108,7 +1108,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
 	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
-				     bpf_prog_run);
+				     bpf_prog_run, 0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -1154,7 +1154,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
-					   bpf_prog_run, flags);
+					   bpf_prog_run, 0, flags);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
@@ -1181,7 +1181,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
 	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
-				     bpf_prog_run);
+				     bpf_prog_run, 0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
@@ -1199,7 +1199,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
 	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-				    bpf_prog_run);
+				    bpf_prog_run, 0);
 	rcu_read_unlock();
 
 	return ret;
@@ -1330,7 +1330,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+				    bpf_prog_run, 0);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1445,7 +1446,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 	lock_sock(sk);
 	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
-				    &ctx, bpf_prog_run);
+				    &ctx, bpf_prog_run, 0);
 	release_sock(sk);
 
 	if (ret)
@@ -1509,7 +1510,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		.sk = sk,
 		.level = level,
 		.optname = optname,
-		.retval = retval,
+		.current_task = current,
 	};
 	int ret;
 
@@ -1553,10 +1554,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
 	lock_sock(sk);
 	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
-				    &ctx, bpf_prog_run);
+				    &ctx, bpf_prog_run, retval);
 	release_sock(sk);
 
-	if (ret)
+	if (ret < 0)
 		goto out;
 
 	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
@@ -1564,14 +1565,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		goto out;
 	}
 
-	/* BPF programs only allowed to set retval to 0, not some
-	 * arbitrary value.
-	 */
-	if (ctx.retval != 0 && ctx.retval != retval) {
-		ret = -EFAULT;
-		goto out;
-	}
-
 	if (ctx.optlen != 0) {
 		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
 		    put_user(ctx.optlen, optlen)) {
@@ -1580,8 +1573,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		}
 	}
 
-	ret = ctx.retval;
-
 out:
 	sockopt_free_buf(&ctx, &buf);
 	return ret;
@@ -1596,10 +1587,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 		.sk = sk,
 		.level = level,
 		.optname = optname,
-		.retval = retval,
 		.optlen = *optlen,
 		.optval = optval,
 		.optval_end = optval + *optlen,
+		.current_task = current,
 	};
 	int ret;
 
@@ -1612,25 +1603,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 */
 
 	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
-				    &ctx, bpf_prog_run);
-	if (ret)
+				    &ctx, bpf_prog_run, retval);
+	if (ret < 0)
 		return ret;
 
 	if (ctx.optlen > *optlen)
 		return -EFAULT;
 
-	/* BPF programs only allowed to set retval to 0, not some
-	 * arbitrary value.
-	 */
-	if (ctx.retval != 0 && ctx.retval != retval)
-		return -EFAULT;
-
 	/* BPF programs can shrink the buffer, export the modifications.
 	 */
 	if (ctx.optlen != 0)
 		*optlen = ctx.optlen;
 
-	return ctx.retval;
+	return ret;
 }
 #endif
 
@@ -2046,10 +2031,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
 		break;
 	case offsetof(struct bpf_sockopt, retval):
-		if (type == BPF_WRITE)
-			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
-		else
-			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+		if (type == BPF_WRITE) {
+			int treg = BPF_REG_9;
+
+			if (si->src_reg == treg || si->dst_reg == treg)
+				--treg;
+			if (si->src_reg == treg || si->dst_reg == treg)
+				--treg;
+			*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+					      offsetof(struct bpf_sockopt_kern, tmp_reg));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+					      treg, si->dst_reg,
+					      offsetof(struct bpf_sockopt_kern, current_task));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+					      treg, treg,
+					      offsetof(struct task_struct, bpf_ctx));
+			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+					      treg, si->src_reg,
+					      offsetof(struct bpf_cg_run_ctx, retval));
+			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+					      offsetof(struct bpf_sockopt_kern, tmp_reg));
+		} else {
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+					      si->dst_reg, si->src_reg,
+					      offsetof(struct bpf_sockopt_kern, current_task));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+					      si->dst_reg, si->dst_reg,
+					      offsetof(struct task_struct, bpf_ctx));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+					      si->dst_reg, si->dst_reg,
+					      offsetof(struct bpf_cg_run_ctx, retval));
+		}
 		break;
 	case offsetof(struct bpf_sockopt, optval):
 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
-- 
cgit v1.2.3


From b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Thu, 16 Dec 2021 02:04:27 +0000
Subject: bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall
 return value

The helpers continue to use int for retval because all the hooks
are int-returning rather than long-returning. The return value of
bpf_set_retval is int for future-proofing, in case in the future
there may be errors trying to set the retval.

After the previous patch, if a program rejects a syscall by
returning 0, an -EPERM will be generated no matter if the retval
is already set to -err. This patch change it being forced only if
retval is not -err. This is because we want to support, for
example, invoking bpf_set_retval(-EINVAL) and return 0, and have
the syscall return value be -EINVAL not -EPERM.

For BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY, the prior behavior is
that, if the return value is NET_XMIT_DROP, the packet is silently
dropped. We preserve this behavior for backward compatibility
reasons, so even if an errno is set, the errno does not return to
caller. However, setting a non-err to retval cannot propagate so
this is not allowed and we return a -EFAULT in that case.

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/b4013fd5d16bed0b01977c1fafdeae12e1de61fb.1639619851.git.zhuyifei@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b6fad0bbf5a7..279ebbed75a5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
  *				  to call cwr
- *   -EPERM			- drop packet
+ *   -err			- drop packet
  *
  * For ingress packets, this function will return -EPERM if any
  * attached program was found and if it returned != 1 during execution.
@@ -1080,6 +1080,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	} else {
 		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
 					    __bpf_prog_run_save_cb, 0);
+		if (ret && !IS_ERR_VALUE((long)ret))
+			ret = -EFAULT;
 	}
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
@@ -1205,6 +1207,36 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	return ret;
 }
 
+BPF_CALL_0(bpf_get_retval)
+{
+	struct bpf_cg_run_ctx *ctx =
+		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+	return ctx->retval;
+}
+
+static const struct bpf_func_proto bpf_get_retval_proto = {
+	.func		= bpf_get_retval,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+	struct bpf_cg_run_ctx *ctx =
+		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+	ctx->retval = retval;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_set_retval_proto = {
+	.func		= bpf_set_retval,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1217,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+	case BPF_FUNC_get_retval:
+		return &bpf_get_retval_proto;
+	case BPF_FUNC_set_retval:
+		return &bpf_set_retval_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 748cd5729ac7421091316e32dcdffb0578563880 Mon Sep 17 00:00:00 2001
From: Di Zhu <zhudi2@huawei.com>
Date: Wed, 19 Jan 2022 09:40:04 +0800
Subject: bpf: support BPF_PROG_QUERY for progs attached to sockmap

Right now there is no way to query whether BPF programs are
attached to a sockmap or not.

we can use the standard interface in libbpf to query, such as:
bpf_prog_query(mapFd, BPF_SK_SKB_STREAM_PARSER, 0, NULL, ...);
the mapFd is the fd of sockmap.

Signed-off-by: Di Zhu <zhudi2@huawei.com>
Acked-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20220119014005.1209-1-zhudi2@huawei.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fa4505f9b611..9e0631f091a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3318,6 +3318,11 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_FLOW_DISSECTOR:
 	case BPF_SK_LOOKUP:
 		return netns_bpf_prog_query(attr, uattr);
+	case BPF_SK_SKB_STREAM_PARSER:
+	case BPF_SK_SKB_STREAM_VERDICT:
+	case BPF_SK_MSG_VERDICT:
+	case BPF_SK_SKB_VERDICT:
+		return sock_map_bpf_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From c2f2cdbeffda7b153c19e0f3d73149c41026c0db Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:52 +0100
Subject: bpf: introduce BPF_F_XDP_HAS_FRAGS flag in prog_flags loading the
 ebpf program

Introduce BPF_F_XDP_HAS_FRAGS and the related field in bpf_prog_aux
in order to notify the driver the loaded program support xdp frags.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/db2e8075b7032a356003f407d1b0deb99adaa0ed.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9e0631f091a6..f29090643c6e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2217,7 +2217,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 				 BPF_F_ANY_ALIGNMENT |
 				 BPF_F_TEST_STATE_FREQ |
 				 BPF_F_SLEEPABLE |
-				 BPF_F_TEST_RND_HI32))
+				 BPF_F_TEST_RND_HI32 |
+				 BPF_F_XDP_HAS_FRAGS))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2303,6 +2304,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 	prog->aux->dst_prog = dst_prog;
 	prog->aux->offload_requested = !!attr->prog_ifindex;
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
-- 
cgit v1.2.3


From d99173027d6803430fd60e61aab3006644e18628 Mon Sep 17 00:00:00 2001
From: Eelco Chaudron <echaudro@redhat.com>
Date: Fri, 21 Jan 2022 11:09:56 +0100
Subject: bpf: add frags support to xdp copy helpers

This patch adds support for frags for the following helpers:
  - bpf_xdp_output()
  - bpf_perf_event_output()

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/340b4a99cdc24337b40eaf8bb597f9f9e7b0373e.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..06a9e220069e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1562,6 +1562,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
 
 extern const struct bpf_func_proto bpf_skb_output_proto;
 extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
 
 BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   struct bpf_map *, map, u64, flags)
@@ -1661,6 +1662,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sock_from_file_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_ptr_cookie_proto;
+	case BPF_FUNC_xdp_get_buff_len:
+		return &bpf_xdp_get_buff_len_trace_proto;
 #endif
 	case BPF_FUNC_seq_printf:
 		return prog->expected_attach_type == BPF_TRACE_ITER ?
-- 
cgit v1.2.3


From f45d5b6ce2e835834c94b8b700787984f02cd662 Mon Sep 17 00:00:00 2001
From: Toke Hoiland-Jorgensen <toke@redhat.com>
Date: Fri, 21 Jan 2022 11:10:02 +0100
Subject: bpf: generalise tail call map compatibility check

The check for tail call map compatibility ensures that tail calls only
happen between maps of the same type. To ensure backwards compatibility for
XDP frags we need a similar type of check for cpumap and devmap
programs, so move the state from bpf_array_aux into bpf_map, add
xdp_has_frags to the check, and apply the same check to cpumap and devmap.

Acked-by: John Fastabend <john.fastabend@gmail.com>
Co-developed-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/f19fd97c0328a39927f3ad03e1ca6b43fd53cdfd.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c |  4 +---
 kernel/bpf/core.c     | 28 ++++++++++++++--------------
 kernel/bpf/cpumap.c   |  8 +++++---
 kernel/bpf/devmap.c   |  3 ++-
 kernel/bpf/syscall.c  | 15 +++++++--------
 5 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c7a5be3bf8be..7f145aefbff8 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 static void *prog_fd_array_get_ptr(struct bpf_map *map,
 				   struct file *map_file, int fd)
 {
-	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_prog *prog = bpf_prog_get(fd);
 
 	if (IS_ERR(prog))
 		return prog;
 
-	if (!bpf_prog_array_compatible(array, prog)) {
+	if (!bpf_prog_map_compatible(map, prog)) {
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}
@@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
 	INIT_LIST_HEAD(&aux->poke_progs);
 	mutex_init(&aux->poke_mutex);
-	spin_lock_init(&aux->owner.lock);
 
 	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index de3e5bc6781f..0a1cfd8544b9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1829,28 +1829,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
 }
 #endif
 
-bool bpf_prog_array_compatible(struct bpf_array *array,
-			       const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+			     const struct bpf_prog *fp)
 {
 	bool ret;
 
 	if (fp->kprobe_override)
 		return false;
 
-	spin_lock(&array->aux->owner.lock);
-
-	if (!array->aux->owner.type) {
+	spin_lock(&map->owner.lock);
+	if (!map->owner.type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
 		 */
-		array->aux->owner.type  = fp->type;
-		array->aux->owner.jited = fp->jited;
+		map->owner.type  = fp->type;
+		map->owner.jited = fp->jited;
+		map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
 		ret = true;
 	} else {
-		ret = array->aux->owner.type  == fp->type &&
-		      array->aux->owner.jited == fp->jited;
+		ret = map->owner.type  == fp->type &&
+		      map->owner.jited == fp->jited &&
+		      map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
 	}
-	spin_unlock(&array->aux->owner.lock);
+	spin_unlock(&map->owner.lock);
+
 	return ret;
 }
 
@@ -1862,13 +1864,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 	mutex_lock(&aux->used_maps_mutex);
 	for (i = 0; i < aux->used_map_cnt; i++) {
 		struct bpf_map *map = aux->used_maps[i];
-		struct bpf_array *array;
 
-		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		if (!map_type_contains_progs(map))
 			continue;
 
-		array = container_of(map, struct bpf_array, map);
-		if (!bpf_prog_array_compatible(array, fp)) {
+		if (!bpf_prog_map_compatible(map, fp)) {
 			ret = -EINVAL;
 			goto out;
 		}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b3e6b9422238..650e5d21f90d 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+				      struct bpf_map *map, int fd)
 {
 	struct bpf_prog *prog;
 
@@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+	if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+	    !bpf_prog_map_compatible(map, prog)) {
 		bpf_prog_put(prog);
 		return -EINVAL;
 	}
@@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	rcpu->map_id = map->id;
 	rcpu->value.qsize  = value->qsize;
 
-	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
 		goto free_ptr_ring;
 
 	/* Setup kthread */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fe019dbdb3f0..038f6d7a83e4 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 					     BPF_PROG_TYPE_XDP, false);
 		if (IS_ERR(prog))
 			goto err_put_dev;
-		if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+		if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+		    !bpf_prog_map_compatible(&dtab->map, prog))
 			goto err_put_prog;
 	}
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f29090643c6e..72ce1edde950 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -556,16 +556,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
 
 static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 {
-	const struct bpf_map *map = filp->private_data;
-	const struct bpf_array *array;
+	struct bpf_map *map = filp->private_data;
 	u32 type = 0, jited = 0;
 
-	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
-		array = container_of(map, struct bpf_array, map);
-		spin_lock(&array->aux->owner.lock);
-		type  = array->aux->owner.type;
-		jited = array->aux->owner.jited;
-		spin_unlock(&array->aux->owner.lock);
+	if (map_type_contains_progs(map)) {
+		spin_lock(&map->owner.lock);
+		type  = map->owner.type;
+		jited = map->owner.jited;
+		spin_unlock(&map->owner.lock);
 	}
 
 	seq_printf(m,
@@ -874,6 +872,7 @@ static int map_create(union bpf_attr *attr)
 	atomic64_set(&map->refcnt, 1);
 	atomic64_set(&map->usercnt, 1);
 	mutex_init(&map->freeze_mutex);
+	spin_lock_init(&map->owner.lock);
 
 	map->spin_lock_off = -EINVAL;
 	map->timer_off = -EINVAL;
-- 
cgit v1.2.3


From b77fb25dcb342788d72ad7533163c34b8b823a1d Mon Sep 17 00:00:00 2001
From: Kenny Yu <kennyyu@fb.com>
Date: Mon, 24 Jan 2022 10:54:00 -0800
Subject: bpf: Add support for bpf iterator programs to use sleepable helpers

This patch allows bpf iterator programs to use sleepable helpers by
changing `bpf_iter_run_prog` to use the appropriate synchronization.
With sleepable bpf iterator programs, we can no longer use
`rcu_read_lock()` and must use `rcu_read_lock_trace()` instead
to protect the bpf program.

Signed-off-by: Kenny Yu <kennyyu@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220124185403.468466-2-kennyyu@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_iter.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b7aef5b3416d..110029ede71e 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
 
 struct bpf_iter_target_info {
 	struct list_head list;
@@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 {
 	int ret;
 
-	rcu_read_lock();
-	migrate_disable();
-	ret = bpf_prog_run(prog, ctx);
-	migrate_enable();
-	rcu_read_unlock();
+	if (prog->aux->sleepable) {
+		rcu_read_lock_trace();
+		migrate_disable();
+		might_fault();
+		ret = bpf_prog_run(prog, ctx);
+		migrate_enable();
+		rcu_read_unlock_trace();
+	} else {
+		rcu_read_lock();
+		migrate_disable();
+		ret = bpf_prog_run(prog, ctx);
+		migrate_enable();
+		rcu_read_unlock();
+	}
 
 	/* bpf program can only return 0 or 1:
 	 *  0 : okay
-- 
cgit v1.2.3


From 376040e47334c6dc6a939a32197acceb00fe4acf Mon Sep 17 00:00:00 2001
From: Kenny Yu <kennyyu@fb.com>
Date: Mon, 24 Jan 2022 10:54:01 -0800
Subject: bpf: Add bpf_copy_from_user_task() helper

This adds a helper for bpf programs to read the memory of other
tasks.

As an example use case at Meta, we are using a bpf task iterator program
and this new helper to print C++ async stack traces for all threads of
a given process.

Signed-off-by: Kenny Yu <kennyyu@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220124185403.468466-3-kennyyu@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c     | 34 ++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c |  2 ++
 2 files changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 01cfdf40c838..ed2780b76cc1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/proc_ns.h>
 #include <linux/security.h>
+#include <linux/btf_ids.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -671,6 +672,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+	int ret;
+
+	/* flags is not used yet */
+	if (unlikely(flags))
+		return -EINVAL;
+
+	if (unlikely(!size))
+		return 0;
+
+	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+	if (ret == size)
+		return 0;
+
+	memset(dst, 0, size);
+	/* Return -EFAULT for partial read */
+	return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+	.func		= bpf_copy_from_user_task,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_BTF_ID,
+	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+	.arg5_type	= ARG_ANYTHING
+};
+
 BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
 {
 	if (cpu >= nr_cpu_ids)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 06a9e220069e..a2024ba32a20 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1235,6 +1235,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_task_stack_proto;
 	case BPF_FUNC_copy_from_user:
 		return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+	case BPF_FUNC_copy_from_user_task:
+		return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
 	case BPF_FUNC_snprintf_btf:
 		return &bpf_snprintf_btf_proto;
 	case BPF_FUNC_per_cpu_ptr:
-- 
cgit v1.2.3


From e204193b138af347fbbbe026e68cb3385112f387 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Mon, 13 Dec 2021 21:26:18 +0800
Subject: lockdep: Use memset_startat() helper in reinit_class()

use memset_startat() helper to simplify the code, there is no functional
change in this patch.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211213132618.105737-1-xiujianfeng@huawei.com
---
 kernel/locking/lockdep.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4a882f83aeb9..89b3df51fd98 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -6011,13 +6011,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
 
 static void reinit_class(struct lock_class *class)
 {
-	void *const p = class;
-	const unsigned int offset = offsetof(struct lock_class, key);
-
 	WARN_ON_ONCE(!class->lock_entry.next);
 	WARN_ON_ONCE(!list_empty(&class->locks_after));
 	WARN_ON_ONCE(!list_empty(&class->locks_before));
-	memset(p + offset, 0, sizeof(*class) - offset);
+	memset_startat(class, 0, key);
 	WARN_ON_ONCE(!class->lock_entry.next);
 	WARN_ON_ONCE(!list_empty(&class->locks_after));
 	WARN_ON_ONCE(!list_empty(&class->locks_before));
-- 
cgit v1.2.3


From 61cc4534b6550997c97a03759ab46b29d44c0017 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Sun, 2 Jan 2022 21:35:58 -0500
Subject: locking/lockdep: Avoid potential access of invalid memory in
 lock_class

It was found that reading /proc/lockdep after a lockdep splat may
potentially cause an access to freed memory if lockdep_unregister_key()
is called after the splat but before access to /proc/lockdep [1]. This
is due to the fact that graph_lock() call in lockdep_unregister_key()
fails after the clearing of debug_locks by the splat process.

After lockdep_unregister_key() is called, the lock_name may be freed
but the corresponding lock_class structure still have a reference to
it. That invalid memory pointer will then be accessed when /proc/lockdep
is read by a user and a use-after-free (UAF) error will be reported if
KASAN is enabled.

To fix this problem, lockdep_unregister_key() is now modified to always
search for a matching key irrespective of the debug_locks state and
zap the corresponding lock class if a matching one is found.

[1] https://lore.kernel.org/lkml/77f05c15-81b6-bddd-9650-80d5f23fe330@i-love.sakura.ne.jp/

Fixes: 8b39adbee805 ("locking/lockdep: Make lockdep_unregister_key() honor 'debug_locks' again")
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lkml.kernel.org/r/20220103023558.1377055-1-longman@redhat.com
---
 kernel/locking/lockdep.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 89b3df51fd98..2e6892ec3756 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -6287,7 +6287,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		lockdep_reset_lock_reg(lock);
 }
 
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
 void lockdep_unregister_key(struct lock_class_key *key)
 {
 	struct hlist_head *hash_head = keyhashentry(key);
@@ -6302,10 +6308,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
 		return;
 
 	raw_local_irq_save(flags);
-	if (!graph_lock())
-		goto out_irq;
+	lockdep_lock();
 
-	pf = get_pending_free();
 	hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
 		if (k == key) {
 			hlist_del_rcu(&k->hash_entry);
@@ -6313,11 +6317,13 @@ void lockdep_unregister_key(struct lock_class_key *key)
 			break;
 		}
 	}
-	WARN_ON_ONCE(!found);
-	__lockdep_free_key_range(pf, key, 1);
-	call_rcu_zapped(pf);
-	graph_unlock();
-out_irq:
+	WARN_ON_ONCE(!found && debug_locks);
+	if (found) {
+		pf = get_pending_free();
+		__lockdep_free_key_range(pf, key, 1);
+		call_rcu_zapped(pf);
+	}
+	lockdep_unlock();
 	raw_local_irq_restore(flags);
 
 	/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
-- 
cgit v1.2.3


From c446fdacb10dcb3b9a9ed3b91d91e72d71d94b03 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 25 Jan 2022 16:13:40 -0800
Subject: bpf: fix register_btf_kfunc_id_set for !CONFIG_DEBUG_INFO_BTF

Commit dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf")
breaks loading of some modules when CONFIG_DEBUG_INFO_BTF is not set.
register_btf_kfunc_id_set returns -ENOENT to the callers when
there is no module btf. Let's return 0 (success) instead to let
those modules work in !CONFIG_DEBUG_INFO_BTF cases.

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Fixes: dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220126001340.1573649-1-sdf@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a1c44c17ea9c..b2a248956100 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6740,8 +6740,19 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 	int ret;
 
 	btf = btf_get_module_btf(kset->owner);
-	if (IS_ERR_OR_NULL(btf))
-		return btf ? PTR_ERR(btf) : -ENOENT;
+	if (!btf) {
+		if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+			pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+			return -ENOENT;
+		}
+		if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+			pr_err("missing module BTF, cannot register kfuncs\n");
+			return -ENOENT;
+		}
+		return 0;
+	}
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
 
 	hook = bpf_prog_type_to_kfunc_hook(prog_type);
 	ret = btf_populate_kfunc_set(btf, hook, kset);
-- 
cgit v1.2.3


From f244b4dc53e520d4570b2610436aba0593ce6f55 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 21 Jan 2022 18:36:28 +0530
Subject: printk: ringbuffer: Improve prb_next_seq() performance

prb_next_seq() always iterates from the first known sequence number.
In the worst case, it might loop 8k times for 256kB buffer,
15k times for 512kB buffer, and 64k times for 2MB buffer.

It was reported that polling and reading using syslog interface
might occupy 50% of CPU.

Speedup the search by storing @id of the last finalized descriptor.

The loop is still needed because the @id is stored and read in the best
effort way. An atomic variable is used to keep the @id consistent.
But the stores and reads are not serialized against each other.
The descriptor could get reused in the meantime. The related sequence
number will be used only when it is still valid.

An invalid value should be read _only_ when there is a flood of messages
and the ringbuffer is rapidly reused. The performance is the least
problem in this case.

Reported-by: Chunlei Wang <chunlei.wang@mediatek.com>
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/1642770388-17327-1-git-send-email-quic_mojha@quicinc.com
Link: https://lore.kernel.org/lkml/YXlddJxLh77DKfIO@alley/T/#m43062e8b2a17f8dbc8c6ccdb8851fb0dbaabbb14
---
 kernel/printk/printk_ringbuffer.c | 52 +++++++++++++++++++++++++++++++++++----
 kernel/printk/printk_ringbuffer.h |  2 ++
 2 files changed, 49 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 8a7b7362c0dd..2b7b6ddab4f7 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
 	 * state has been re-checked. A memcpy() for all of @desc
 	 * cannot be used because of the atomic_t @state_var field.
 	 */
-	memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
-	       sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+	if (desc_out) {
+		memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+		       sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+	}
 	if (seq_out)
 		*seq_out = info->seq; /* also part of desc_read:C */
 	if (caller_id_out)
@@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
 	state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
 	d_state = get_desc_state(id, state_val);
 out:
-	atomic_long_set(&desc_out->state_var, state_val);
+	if (desc_out)
+		atomic_long_set(&desc_out->state_var, state_val);
 	return d_state;
 }
 
@@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
 
 	atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
 			DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+
+	/* Best effort to remember the last finalized @id. */
+	atomic_long_set(&desc_ring->last_finalized_id, id);
 }
 
 /**
@@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e)
  */
 void prb_final_commit(struct prb_reserved_entry *e)
 {
+	struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+
 	_prb_commit(e, desc_finalized);
+
+	/* Best effort to remember the last finalized @id. */
+	atomic_long_set(&desc_ring->last_finalized_id, e->id);
 }
 
 /*
@@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
  */
 u64 prb_next_seq(struct printk_ringbuffer *rb)
 {
-	u64 seq = 0;
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	enum desc_state d_state;
+	unsigned long id;
+	u64 seq;
+
+	/* Check if the cached @id still points to a valid @seq. */
+	id = atomic_long_read(&desc_ring->last_finalized_id);
+	d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
 
-	/* Search forward from the oldest descriptor. */
+	if (d_state == desc_finalized || d_state == desc_reusable) {
+		/*
+		 * Begin searching after the last finalized record.
+		 *
+		 * On 0, the search must begin at 0 because of hack#2
+		 * of the bootstrapping phase it is not known if a
+		 * record at index 0 exists.
+		 */
+		if (seq != 0)
+			seq++;
+	} else {
+		/*
+		 * The information about the last finalized sequence number
+		 * has gone. It should happen only when there is a flood of
+		 * new messages and the ringbuffer is rapidly recycled.
+		 * Give up and start from the beginning.
+		 */
+		seq = 0;
+	}
+
+	/*
+	 * The information about the last finalized @seq might be inaccurate.
+	 * Search forward to find the current one.
+	 */
 	while (_prb_read_valid(rb, &seq, NULL, NULL))
 		seq++;
 
@@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb,
 	rb->desc_ring.infos = infos;
 	atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
 	atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+	atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
 
 	rb->text_data_ring.size_bits = textbits;
 	rb->text_data_ring.data = text_buf;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 73cc80e01cef..18cd25e489b8 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -75,6 +75,7 @@ struct prb_desc_ring {
 	struct printk_info	*infos;
 	atomic_long_t		head_id;
 	atomic_long_t		tail_id;
+	atomic_long_t		last_finalized_id;
 };
 
 /*
@@ -258,6 +259,7 @@ static struct printk_ringbuffer name = {							\
 		.infos		= &_##name##_infos[0],						\
 		.head_id	= ATOMIC_INIT(DESC0_ID(descbits)),				\
 		.tail_id	= ATOMIC_INIT(DESC0_ID(descbits)),				\
+		.last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)),				\
 	},											\
 	.text_data_ring = {									\
 		.size_bits	= (avgtextbits) + (descbits),					\
-- 
cgit v1.2.3


From dfcf2e017f5bb928094952d5d56d3566d3d07ba7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 25 Jan 2022 16:20:01 +0300
Subject: swiotlb: do not zero buffer in set_memory_decrypted()

For larger TDX VM, memset() after set_memory_decrypted() in
swiotlb_update_mem_attributes() takes substantial portion of boot time.

Zeroing doesn't serve any functional purpose. Malicious VMM can mess
with decrypted/shared buffer at any point.

Remove the memset().

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b43..9390b38d2897 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -207,8 +207,6 @@ void __init swiotlb_update_mem_attributes(void)
 	mem->vaddr = swiotlb_mem_remap(mem, bytes);
 	if (!mem->vaddr)
 		mem->vaddr = vaddr;
-
-	memset(mem->vaddr, 0, bytes);
 }
 
 static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
-- 
cgit v1.2.3


From 35265899acef135225e946b883fb07acba1d31a2 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 24 Jan 2022 16:40:17 +0000
Subject: swiotlb: simplify debugfs setup

Debugfs functions are already stubbed out for !CONFIG_DEBUG_FS, so we
can remove most of the #ifdefs, just keeping one to manually optimise
away the initcall when it would do nothing. We can also simplify the
code itself by factoring out the directory creation and realising that
the global io_tlb_default_mem now makes debugfs_dir redundant.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 40 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9390b38d2897..f829259262fd 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -36,9 +36,7 @@
 #include <linux/scatterlist.h>
 #include <linux/cc_platform.h>
 #include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
-#endif
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 #include <linux/io.h>
 #include <linux/of.h>
@@ -756,47 +754,29 @@ bool is_swiotlb_active(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
-
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+					 const char *dirname)
 {
+	mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+	if (!mem->nslabs)
+		return;
+
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
 	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
 }
 
-static int __init swiotlb_create_default_debugfs(void)
+static int __init __maybe_unused swiotlb_create_default_debugfs(void)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-
-	debugfs_dir = debugfs_create_dir("swiotlb", NULL);
-	if (mem->nslabs) {
-		mem->debugfs = debugfs_dir;
-		swiotlb_create_debugfs_files(mem);
-	}
+	swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
 	return 0;
 }
 
+#ifdef CONFIG_DEBUG_FS
 late_initcall(swiotlb_create_default_debugfs);
-
 #endif
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-	struct io_tlb_mem *mem = rmem->priv;
-
-	mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
-	swiotlb_create_debugfs_files(mem);
-}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-}
-#endif
-
 struct page *swiotlb_alloc(struct device *dev, size_t size)
 {
 	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
@@ -858,7 +838,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 
 		rmem->priv = mem;
 
-		rmem_swiotlb_debugfs_init(rmem);
+		swiotlb_create_debugfs_files(mem, rmem->name);
 	}
 
 	dev->dma_io_tlb_mem = mem;
-- 
cgit v1.2.3


From c0a4191c27a12d3175283fa33f16db20e91008fd Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 24 Jan 2022 16:40:18 +0000
Subject: swiotlb: tidy up includes

SWIOTLB's includes have become a great big mess. Restore some order by
consolidating the random different blocks, sorting alphabetically, and
purging some clearly unnecessary entries - linux/io.h is now included
unconditionally, so need not be duplicated in the restricted DMA pool
case; similarly, linux/io.h subsumes asm/io.h; and by now it's a
mystery why asm/dma.h was ever here at all.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f829259262fd..f3ff0af49f81 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,38 +21,33 @@
 #define pr_fmt(fmt) "software IO TLB: " fmt
 
 #include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
 #include <linux/dma-direct.h>
 #include <linux/dma-map-ops.h>
-#include <linux/mm.h>
 #include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/swiotlb.h>
-#include <linux/pfn.h>
 #include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/cc_platform.h>
-#include <linux/set_memory.h>
-#include <linux/debugfs.h>
 #ifdef CONFIG_DMA_RESTRICTED_POOL
-#include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/of_reserved_mem.h>
 #include <linux/slab.h>
 #endif
 
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
 #define CREATE_TRACE_POINTS
 #include <trace/events/swiotlb.h>
 
-- 
cgit v1.2.3


From 404f9373c4e5c943ed8a5e71c8dcfef9eddd54ab Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 24 Jan 2022 16:40:19 +0000
Subject: swiotlb: simplify array allocation

Prefer kcalloc() to kzalloc(array_size()) for allocating an array.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f3ff0af49f81..908eac2527cb 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -818,8 +818,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 		if (!mem)
 			return -ENOMEM;
 
-		mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
-				     GFP_KERNEL);
+		mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
 		if (!mem->slots) {
 			kfree(mem);
 			return -ENOMEM;
-- 
cgit v1.2.3


From 28c988c3ec29db74a1dda631b18785958d57df4f Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@amd.com>
Date: Tue, 18 Jan 2022 10:35:15 +0530
Subject: sched/debug: Remove mpol_get/put and task_lock/unlock from
 sched_show_numa

The older format of /proc/pid/sched printed home node info which
required the mempolicy and task lock around mpol_get(). However
the format has changed since then and there is no need for
sched_show_numa() any more to have mempolicy argument,
asssociated mpol_get/put and task_lock/unlock. Remove them.

Fixes: 397f2378f1361 ("sched/numa: Fix numa balancing stats in /proc/pid/sched")
Signed-off-by: Bharata B Rao <bharata@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20220118050515.2973-1-bharata@amd.com
---
 kernel/sched/debug.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..102d6f70e84d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -931,25 +931,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
 static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 {
 #ifdef CONFIG_NUMA_BALANCING
-	struct mempolicy *pol;
-
 	if (p->mm)
 		P(mm->numa_scan_seq);
 
-	task_lock(p);
-	pol = p->mempolicy;
-	if (pol && !(pol->flags & MPOL_F_MORON))
-		pol = NULL;
-	mpol_get(pol);
-	task_unlock(p);
-
 	P(numa_pages_migrated);
 	P(numa_preferred_nid);
 	P(total_numa_faults);
 	SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
 			task_node(p), task_numa_group_id(p));
 	show_numa_stats(p, m);
-	mpol_put(pol);
 #endif
 }
 
-- 
cgit v1.2.3


From 12bf8a7eb84e4d3547ebfd89bb0a9255a0f2acc7 Mon Sep 17 00:00:00 2001
From: Honglei Wang <wanghonglei@didichuxing.com>
Date: Thu, 13 Jan 2022 21:39:20 +0800
Subject: sched/numa: initialize numa statistics when forking new task

The child processes will inherit numa_pages_migrated and
total_numa_faults from the parent. It means even if there is no numa
fault happen on the child, the statistics in /proc/$pid of the child
process might show huge amount. This is a bit weird. Let's initialize
them when do fork.

Signed-off-by: Honglei Wang <wanghonglei@didichuxing.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20220113133920.49900-1-wanghonglei@didichuxing.com
---
 kernel/sched/fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df..5dca13ff89f2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2825,6 +2825,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	/* Protect against double add, see task_tick_numa and task_numa_work */
 	p->numa_work.next		= &p->numa_work;
 	p->numa_faults			= NULL;
+	p->numa_pages_migrated		= 0;
+	p->total_numa_faults		= 0;
 	RCU_INIT_POINTER(p->numa_group, NULL);
 	p->last_task_numa_placement	= 0;
 	p->last_sum_exec_runtime	= 0;
-- 
cgit v1.2.3


From 77cf151b7bbdfa3577b3c3f3a5e267a6c60a263b Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Thu, 28 Oct 2021 12:50:05 +0100
Subject: sched/core: Export pelt_thermal_tp

We can't use this tracepoint in modules without having the symbol
exported first, fix that.

Fixes: 765047932f15 ("sched/pelt: Add support to track thermal pressure")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211028115005.873539-1-qais.yousef@arm.com
---
 kernel/sched/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e4ae00e52d1..1d863d7f6ad7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
-- 
cgit v1.2.3


From 7a17e1db1265471f7718af100cfc5e41280d53a7 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Thu, 16 Dec 2021 22:53:19 +0000
Subject: sched/sugov: Ignore 'busy' filter when rq is capped by uclamp_max

sugov_update_single_{freq, perf}() contains a 'busy' filter that ensures
we don't bring the frqeuency down if there's no idle time (CPU is busy).

The problem is that with uclamp_max we will have scenarios where a busy
task is capped to run at a lower frequency and this filter prevents
applying the capping when this task starts running.

We handle this by skipping the filter when uclamp is enabled and the rq
is being capped by uclamp_max.

We introduce a new function uclamp_rq_is_capped() to help detecting when
this capping is taking effect. Some code shuffling was required to allow
using cpu_util_{cfs, rt}() in this new function.

On 2 Core SMT2 Intel laptop I see:

Without this patch:

	uclampset -M 0 sysbench --test=cpu --threads = 4 run

produces a score of ~3200 consistently. Which is the highest possible.

Compiling the kernel also results in frequency running at max 3.1GHz all
the time - running uclampset -M 400 to cap it has no effect without this
patch.

With this patch:

	uclampset -M 0 sysbench --test=cpu --threads = 4 run

produces a score of ~1100 with some outliers in ~1700. Uclamp max
aggregates the performance requirements, so having high values sometimes
is expected if some other task happens to require that frequency starts
running at the same time.

When compiling the kernel with uclampset -M 400 I can see the
frequencies mostly in the ~2GHz region. Helpful to conserve power and
prevent heating when not plugged in.

Fixes: 982d9cdc22c9 ("sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211216225320.2957053-2-qais.yousef@arm.com
---
 kernel/sched/cpufreq_schedutil.c |  10 ++-
 kernel/sched/sched.h             | 181 +++++++++++++++++++++------------------
 2 files changed, 107 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 26778884d9ab..62d98b09aaa5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -348,8 +348,11 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	/*
 	 * Do not reduce the frequency if the CPU has not been idle
 	 * recently, as the reduction is likely to be premature then.
+	 *
+	 * Except when the rq is capped by uclamp_max.
 	 */
-	if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
 		next_f = sg_policy->next_freq;
 
 		/* Restore cached freq as next_freq has changed */
@@ -395,8 +398,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	/*
 	 * Do not reduce the target performance level if the CPU has not been
 	 * idle recently, as the reduction is likely to be premature then.
+	 *
+	 * Except when the rq is capped by uclamp_max.
 	 */
-	if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+	    sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
 		sg_cpu->util = prev_util;
 
 	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..9b33ba9c3c42 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2841,88 +2841,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif /* CONFIG_CPU_FREQ */
 
-#ifdef CONFIG_UCLAMP_TASK
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
-
-/**
- * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
- * @rq:		The rq to clamp against. Must not be NULL.
- * @util:	The util value to clamp.
- * @p:		The task to clamp against. Can be NULL if you want to clamp
- *		against @rq only.
- *
- * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
- *
- * If sched_uclamp_used static key is disabled, then just return the util
- * without any clamping since uclamp aggregation at the rq level in the fast
- * path is disabled, rendering this operation a NOP.
- *
- * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
- * will return the correct effective uclamp value of the task even if the
- * static key is disabled.
- */
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
-				  struct task_struct *p)
-{
-	unsigned long min_util = 0;
-	unsigned long max_util = 0;
-
-	if (!static_branch_likely(&sched_uclamp_used))
-		return util;
-
-	if (p) {
-		min_util = uclamp_eff_value(p, UCLAMP_MIN);
-		max_util = uclamp_eff_value(p, UCLAMP_MAX);
-
-		/*
-		 * Ignore last runnable task's max clamp, as this task will
-		 * reset it. Similarly, no need to read the rq's min clamp.
-		 */
-		if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
-			goto out;
-	}
-
-	min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
-	max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
-out:
-	/*
-	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
-	 * RUNNABLE tasks with _different_ clamps, we can end up with an
-	 * inversion. Fix it now when the clamps are applied.
-	 */
-	if (unlikely(min_util >= max_util))
-		return min_util;
-
-	return clamp(util, min_util, max_util);
-}
-
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
-	return static_branch_likely(&sched_uclamp_used);
-}
-#else /* CONFIG_UCLAMP_TASK */
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
-				  struct task_struct *p)
-{
-	return util;
-}
-
-static inline bool uclamp_is_used(void)
-{
-	return false;
-}
-#endif /* CONFIG_UCLAMP_TASK */
-
 #ifdef arch_scale_freq_capacity
 # ifndef arch_scale_freq_invariant
 #  define arch_scale_freq_invariant()	true
@@ -3020,6 +2938,105 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_UCLAMP_TASK
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+
+/**
+ * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
+ * @rq:		The rq to clamp against. Must not be NULL.
+ * @util:	The util value to clamp.
+ * @p:		The task to clamp against. Can be NULL if you want to clamp
+ *		against @rq only.
+ *
+ * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
+ *
+ * If sched_uclamp_used static key is disabled, then just return the util
+ * without any clamping since uclamp aggregation at the rq level in the fast
+ * path is disabled, rendering this operation a NOP.
+ *
+ * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
+ * will return the correct effective uclamp value of the task even if the
+ * static key is disabled.
+ */
+static __always_inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+				  struct task_struct *p)
+{
+	unsigned long min_util = 0;
+	unsigned long max_util = 0;
+
+	if (!static_branch_likely(&sched_uclamp_used))
+		return util;
+
+	if (p) {
+		min_util = uclamp_eff_value(p, UCLAMP_MIN);
+		max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+		/*
+		 * Ignore last runnable task's max clamp, as this task will
+		 * reset it. Similarly, no need to read the rq's min clamp.
+		 */
+		if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+			goto out;
+	}
+
+	min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+	max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
+	/*
+	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
+	 * RUNNABLE tasks with _different_ clamps, we can end up with an
+	 * inversion. Fix it now when the clamps are applied.
+	 */
+	if (unlikely(min_util >= max_util))
+		return min_util;
+
+	return clamp(util, min_util, max_util);
+}
+
+/* Is the rq being capped/throttled by uclamp_max? */
+static inline bool uclamp_rq_is_capped(struct rq *rq)
+{
+	unsigned long rq_util;
+	unsigned long max_util;
+
+	if (!static_branch_likely(&sched_uclamp_used))
+		return false;
+
+	rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+	return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+	return static_branch_likely(&sched_uclamp_used);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+				  struct task_struct *p)
+{
+	return util;
+}
+
+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
+
+static inline bool uclamp_is_used(void)
+{
+	return false;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 static inline unsigned long cpu_util_irq(struct rq *rq)
 {
-- 
cgit v1.2.3


From d37aee9018e68b0d356195caefbb651910e0bbfa Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Thu, 16 Dec 2021 22:53:20 +0000
Subject: sched/uclamp: Fix iowait boost escaping uclamp restriction

iowait_boost signal is applied independently of util and doesn't take
into account uclamp settings of the rq. An io heavy task that is capped
by uclamp_max could still request higher frequency because
sugov_iowait_apply() doesn't clamp the boost via uclamp_rq_util_with()
like effective_cpu_util() does.

Make sure that iowait_boost honours uclamp requests by calling
uclamp_rq_util_with() when applying the boost.

Fixes: 982d9cdc22c9 ("sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20211216225320.2957053-3-qais.yousef@arm.com
---
 kernel/sched/cpufreq_schedutil.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 62d98b09aaa5..6d65ab6e484e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -289,6 +289,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
 	 * into the same scale so we can compare.
 	 */
 	boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+	boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
 	if (sg_cpu->util < boost)
 		sg_cpu->util = boost;
 }
-- 
cgit v1.2.3


From 5102bb1c9f82857a3164af9d7ab7ad628cb783ed Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 19 Jan 2022 14:39:40 -0800
Subject: psi: Fix "defined but not used" warnings when CONFIG_PROC_FS=n

When CONFIG_PROC_FS is disabled psi code generates the following warnings:

kernel/sched/psi.c:1364:30: warning: 'psi_cpu_proc_ops' defined but not used [-Wunused-const-variable=]
    1364 | static const struct proc_ops psi_cpu_proc_ops = {
         |                              ^~~~~~~~~~~~~~~~
kernel/sched/psi.c:1355:30: warning: 'psi_memory_proc_ops' defined but not used [-Wunused-const-variable=]
    1355 | static const struct proc_ops psi_memory_proc_ops = {
         |                              ^~~~~~~~~~~~~~~~~~~
kernel/sched/psi.c:1346:30: warning: 'psi_io_proc_ops' defined but not used [-Wunused-const-variable=]
    1346 | static const struct proc_ops psi_io_proc_ops = {
         |                              ^~~~~~~~~~~~~~~

Make definitions of these structures and related functions conditional on
CONFIG_PROC_FS config.

Fixes: 0e94682b73bf ("psi: introduce psi monitor")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220119223940.787748-3-surenb@google.com
---
 kernel/sched/psi.c | 79 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a679613a7cb7..cfe76f704d8a 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1082,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	return 0;
 }
 
-static int psi_io_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
-	return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
-	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
-		return -EPERM;
-
-	return single_open(file, psi_show, NULL);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
-	return psi_open(file, psi_io_show);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
-	return psi_open(file, psi_memory_show);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
-	return psi_open(file, psi_cpu_show);
-}
-
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
 			char *buf, size_t nbytes, enum psi_res res)
 {
@@ -1296,6 +1258,45 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
 	return ret;
 }
 
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+	return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	return single_open(file, psi_show, NULL);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+	return psi_open(file, psi_io_show);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+	return psi_open(file, psi_memory_show);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+	return psi_open(file, psi_cpu_show);
+}
+
 static ssize_t psi_write(struct file *file, const char __user *user_buf,
 			 size_t nbytes, enum psi_res res)
 {
@@ -1400,3 +1401,5 @@ static int __init psi_proc_init(void)
 	return 0;
 }
 module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
-- 
cgit v1.2.3


From 46531a30364bd483bfa1b041c15d42a196e77e93 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 27 Jan 2022 14:09:13 +0000
Subject: cgroup/bpf: fast path skb BPF filtering

Even though there is a static key protecting from overhead from
cgroup-bpf skb filtering when there is nothing attached, in many cases
it's not enough as registering a filter for one type will ruin the fast
path for all others. It's observed in production servers I've looked
at but also in laptops, where registration is done during init by
systemd or something else.

Add a per-socket fast path check guarding from such overhead. This
affects both receive and transmit paths of TCP, UDP and other
protocols. It showed ~1% tx/s improvement in small payload UDP
send benchmarks using a real NIC and in a server environment and the
number jumps to 2-3% for preemtible kernels.

Reviewed-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/r/d8c58857113185a764927a46f4b5a058d36d3ec3.1643292455.git.asml.silence@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup.c | 30 ------------------------------
 kernel/bpf/core.c   | 16 ++++------------
 2 files changed, 4 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 279ebbed75a5..098632fdbc45 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1384,20 +1384,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 
 #ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
-					     enum cgroup_bpf_attach_type attach_type)
-{
-	struct bpf_prog_array *prog_array;
-	bool empty;
-
-	rcu_read_lock();
-	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
-	empty = bpf_prog_array_is_empty(prog_array);
-	rcu_read_unlock();
-
-	return empty;
-}
-
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
 			     struct bpf_sockopt_buf *buf)
 {
@@ -1456,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	};
 	int ret, max_optlen;
 
-	/* Opportunistic check to see whether we have any BPF program
-	 * attached to the hook so we don't waste time allocating
-	 * memory and locking the socket.
-	 */
-	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
-		return 0;
-
 	/* Allocate a bit more than the initial user buffer for
 	 * BPF program. The canonical use case is overriding
 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
 	 */
 	max_optlen = max_t(int, 16, *optlen);
-
 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
@@ -1550,15 +1528,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	};
 	int ret;
 
-	/* Opportunistic check to see whether we have any BPF program
-	 * attached to the hook so we don't waste time allocating
-	 * memory and locking the socket.
-	 */
-	if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
-		return retval;
-
 	ctx.optlen = max_optlen;
-
 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
 	if (max_optlen < 0)
 		return max_optlen;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0a1cfd8544b9..04a8d5bea552 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1968,18 +1968,10 @@ static struct bpf_prog_dummy {
 	},
 };
 
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
-	struct bpf_prog_array hdr;
-	struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
 	.null_prog = NULL,
 };
+EXPORT_SYMBOL(bpf_empty_prog_array);
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 {
@@ -1989,12 +1981,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 			       (prog_cnt + 1),
 			       flags);
 
-	return &empty_prog_array.hdr;
+	return &bpf_empty_prog_array.hdr;
 }
 
 void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
-	if (!progs || progs == &empty_prog_array.hdr)
+	if (!progs || progs == &bpf_empty_prog_array.hdr)
 		return;
 	kfree_rcu(progs, rcu);
 }
-- 
cgit v1.2.3


From c6f1bfe89ac95dc829dcb4ed54780da134ac5fce Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 27 Jan 2022 07:46:06 -0800
Subject: bpf: reject program if a __user tagged memory accessed in kernel way

BPF verifier supports direct memory access for BPF_PROG_TYPE_TRACING type
of bpf programs, e.g., a->b. If "a" is a pointer
pointing to kernel memory, bpf verifier will allow user to write
code in C like a->b and the verifier will translate it to a kernel
load properly. If "a" is a pointer to user memory, it is expected
that bpf developer should be bpf_probe_read_user() helper to
get the value a->b. Without utilizing BTF __user tagging information,
current verifier will assume that a->b is a kernel memory access
and this may generate incorrect result.

Now BTF contains __user information, it can check whether the
pointer points to a user memory or not. If it is, the verifier
can reject the program and force users to use bpf_probe_read_user()
helper explicitly.

In the future, we can easily extend btf_add_space for other
address space tagging, for example, rcu/percpu etc.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220127154606.654961-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      | 34 ++++++++++++++++++++++++++++------
 kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++-----------
 2 files changed, 52 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b2a248956100..b983cee8d196 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4886,6 +4886,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	const char *tname = prog->aux->attach_func_name;
 	struct bpf_verifier_log *log = info->log;
 	const struct btf_param *args;
+	const char *tag_value;
 	u32 nr_args, arg;
 	int i, ret;
 
@@ -5038,6 +5039,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	info->btf = btf;
 	info->btf_id = t->type;
 	t = btf_type_by_id(btf, t->type);
+
+	if (btf_type_is_type_tag(t)) {
+		tag_value = __btf_name_by_offset(btf, t->name_off);
+		if (strcmp(tag_value, "user") == 0)
+			info->reg_type |= MEM_USER;
+	}
+
 	/* skip modifiers */
 	while (btf_type_is_modifier(t)) {
 		info->btf_id = t->type;
@@ -5064,12 +5072,12 @@ enum bpf_struct_walk_result {
 
 static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 			   const struct btf_type *t, int off, int size,
-			   u32 *next_btf_id)
+			   u32 *next_btf_id, enum bpf_type_flag *flag)
 {
 	u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
 	const struct btf_type *mtype, *elem_type = NULL;
 	const struct btf_member *member;
-	const char *tname, *mname;
+	const char *tname, *mname, *tag_value;
 	u32 vlen, elem_id, mid;
 
 again:
@@ -5253,7 +5261,8 @@ error:
 		}
 
 		if (btf_type_is_ptr(mtype)) {
-			const struct btf_type *stype;
+			const struct btf_type *stype, *t;
+			enum bpf_type_flag tmp_flag = 0;
 			u32 id;
 
 			if (msize != size || off != moff) {
@@ -5262,9 +5271,19 @@ error:
 					mname, moff, tname, off, size);
 				return -EACCES;
 			}
+
+			/* check __user tag */
+			t = btf_type_by_id(btf, mtype->type);
+			if (btf_type_is_type_tag(t)) {
+				tag_value = __btf_name_by_offset(btf, t->name_off);
+				if (strcmp(tag_value, "user") == 0)
+					tmp_flag = MEM_USER;
+			}
+
 			stype = btf_type_skip_modifiers(btf, mtype->type, &id);
 			if (btf_type_is_struct(stype)) {
 				*next_btf_id = id;
+				*flag = tmp_flag;
 				return WALK_PTR;
 			}
 		}
@@ -5291,13 +5310,14 @@ error:
 int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, int off, int size,
 		      enum bpf_access_type atype __maybe_unused,
-		      u32 *next_btf_id)
+		      u32 *next_btf_id, enum bpf_type_flag *flag)
 {
+	enum bpf_type_flag tmp_flag = 0;
 	int err;
 	u32 id;
 
 	do {
-		err = btf_struct_walk(log, btf, t, off, size, &id);
+		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
 
 		switch (err) {
 		case WALK_PTR:
@@ -5305,6 +5325,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
 			 * we're done.
 			 */
 			*next_btf_id = id;
+			*flag = tmp_flag;
 			return PTR_TO_BTF_ID;
 		case WALK_SCALAR:
 			return SCALAR_VALUE;
@@ -5349,6 +5370,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
 			  const struct btf *need_btf, u32 need_type_id)
 {
 	const struct btf_type *type;
+	enum bpf_type_flag flag;
 	int err;
 
 	/* Are we already done? */
@@ -5359,7 +5381,7 @@ again:
 	type = btf_type_by_id(btf, id);
 	if (!type)
 		return false;
-	err = btf_struct_walk(log, btf, type, off, 1, &id);
+	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
 	if (err != WALK_STRUCT)
 		return false;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dcf065ec2774..1ae41d0cf96c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -536,7 +536,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
 static const char *reg_type_str(struct bpf_verifier_env *env,
 				enum bpf_reg_type type)
 {
-	char postfix[16] = {0}, prefix[16] = {0};
+	char postfix[16] = {0}, prefix[32] = {0};
 	static const char * const str[] = {
 		[NOT_INIT]		= "?",
 		[SCALAR_VALUE]		= "inv",
@@ -570,9 +570,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 	}
 
 	if (type & MEM_RDONLY)
-		strncpy(prefix, "rdonly_", 16);
+		strncpy(prefix, "rdonly_", 32);
 	if (type & MEM_ALLOC)
-		strncpy(prefix, "alloc_", 16);
+		strncpy(prefix, "alloc_", 32);
+	if (type & MEM_USER)
+		strncpy(prefix, "user_", 32);
 
 	snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
 		 prefix, str[base_type(type)], postfix);
@@ -1547,14 +1549,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 static void mark_btf_ld_reg(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *regs, u32 regno,
 			    enum bpf_reg_type reg_type,
-			    struct btf *btf, u32 btf_id)
+			    struct btf *btf, u32 btf_id,
+			    enum bpf_type_flag flag)
 {
 	if (reg_type == SCALAR_VALUE) {
 		mark_reg_unknown(env, regs, regno);
 		return;
 	}
 	mark_reg_known_zero(env, regs, regno);
-	regs[regno].type = PTR_TO_BTF_ID;
+	regs[regno].type = PTR_TO_BTF_ID | flag;
 	regs[regno].btf = btf;
 	regs[regno].btf_id = btf_id;
 }
@@ -4152,6 +4155,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = regs + regno;
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+	enum bpf_type_flag flag = 0;
 	u32 btf_id;
 	int ret;
 
@@ -4171,9 +4175,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	if (reg->type & MEM_USER) {
+		verbose(env,
+			"R%d is ptr_%s access user memory: off=%d\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
 	if (env->ops->btf_struct_access) {
 		ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
-						  off, size, atype, &btf_id);
+						  off, size, atype, &btf_id, &flag);
 	} else {
 		if (atype != BPF_READ) {
 			verbose(env, "only read is supported\n");
@@ -4181,14 +4192,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		}
 
 		ret = btf_struct_access(&env->log, reg->btf, t, off, size,
-					atype, &btf_id);
+					atype, &btf_id, &flag);
 	}
 
 	if (ret < 0)
 		return ret;
 
 	if (atype == BPF_READ && value_regno >= 0)
-		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
 
 	return 0;
 }
@@ -4201,6 +4212,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 {
 	struct bpf_reg_state *reg = regs + regno;
 	struct bpf_map *map = reg->map_ptr;
+	enum bpf_type_flag flag = 0;
 	const struct btf_type *t;
 	const char *tname;
 	u32 btf_id;
@@ -4238,12 +4250,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+	ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
 	if (ret < 0)
 		return ret;
 
 	if (value_regno >= 0)
-		mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+		mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
 
 	return 0;
 }
@@ -4444,7 +4456,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (err < 0)
 			return err;
 
-		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+				       &btf_id);
 		if (err)
 			verbose_linfo(env, insn_idx, "; ");
 		if (!err && t == BPF_READ && value_regno >= 0) {
-- 
cgit v1.2.3


From 1c4cafd11599abdbc53a520f0b6e6799d037eae1 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 23 Jan 2022 10:38:52 -0800
Subject: padata: replace cpumask_weight with cpumask_empty in padata.c

padata_do_parallel() calls cpumask_weight() to check if any bit of a
given cpumask is set. We can do it more efficiently with cpumask_empty()
because cpumask_empty() stops traversing the cpumask as soon as it finds
first set bit, while cpumask_weight() counts all bits unconditionally.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 18d3a5c699d8..e5819bb8bd1d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -181,7 +181,7 @@ int padata_do_parallel(struct padata_shell *ps,
 		goto out;
 
 	if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
-		if (!cpumask_weight(pd->cpumask.cbcpu))
+		if (cpumask_empty(pd->cpumask.cbcpu))
 			goto out;
 
 		/* Select an alternate fallback CPU and notify the caller. */
-- 
cgit v1.2.3


From 0407a65f356e6d9340ad673907c17e52fade43e3 Mon Sep 17 00:00:00 2001
From: Kenta Tada <Kenta.Tada@sony.com>
Date: Sat, 29 Jan 2022 02:09:06 +0900
Subject: bpf: make bpf_copy_from_user_task() gpl only

access_process_vm() is exported by EXPORT_SYMBOL_GPL().

Signed-off-by: Kenta Tada <Kenta.Tada@sony.com>
Link: https://lore.kernel.org/r/20220128170906.21154-1-Kenta.Tada@sony.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ed2780b76cc1..4e5969fde0b3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -695,7 +695,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
 
 const struct bpf_func_proto bpf_copy_from_user_task_proto = {
 	.func		= bpf_copy_from_user_task,
-	.gpl_only	= false,
+	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
-- 
cgit v1.2.3


From 1ddbddd7065182c10c7c50ba6daf890edfdf7377 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 31 Jan 2022 14:05:25 -0800
Subject: bpf: Remove unnecessary setrlimit from bpf preload.

BPF programs and maps are memcg accounted. setrlimit is obsolete.
Remove its use from bpf preload.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220131220528.98088-5-alexei.starovoitov@gmail.com
---
 kernel/bpf/preload/iterators/iterators.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
index 5d872a705470..2ec85fc6984f 100644
--- a/kernel/bpf/preload/iterators/iterators.c
+++ b/kernel/bpf/preload/iterators/iterators.c
@@ -37,7 +37,6 @@ static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
 
 int main(int argc, char **argv)
 {
-	struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
 	struct iterators_bpf *skel;
 	int err, magic;
 	int debug_fd;
@@ -55,7 +54,6 @@ int main(int argc, char **argv)
 		printf("bad start magic %d\n", magic);
 		return 1;
 	}
-	setrlimit(RLIMIT_MEMLOCK, &rlim);
 	/* libbpf opens BPF object and loads it into the kernel */
 	skel = iterators_bpf__open_and_load();
 	if (!skel) {
-- 
cgit v1.2.3


From 79b203926d18cb8c110564d8a09ff646691de9e7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 31 Jan 2022 14:05:26 -0800
Subject: bpf: Convert bpf preload to light skeleton.

Convert bpffs preload iterators to light skeleton.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220131220528.98088-6-alexei.starovoitov@gmail.com
---
 kernel/bpf/preload/iterators/Makefile          |   6 +-
 kernel/bpf/preload/iterators/iterators.c       |  10 +-
 kernel/bpf/preload/iterators/iterators.lskel.h | 428 +++++++++++++++++++++++++
 kernel/bpf/preload/iterators/iterators.skel.h  | 412 ------------------------
 4 files changed, 436 insertions(+), 420 deletions(-)
 create mode 100644 kernel/bpf/preload/iterators/iterators.lskel.h
 delete mode 100644 kernel/bpf/preload/iterators/iterators.skel.h

(limited to 'kernel')

diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index b8bd60511227..bfe24f8c5a20 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -35,15 +35,15 @@ endif
 
 .PHONY: all clean
 
-all: iterators.skel.h
+all: iterators.lskel.h
 
 clean:
 	$(call msg,CLEAN)
 	$(Q)rm -rf $(OUTPUT) iterators
 
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
 	$(call msg,GEN-SKEL,$@)
-	$(Q)$(BPFTOOL) gen skeleton $< > $@
+	$(Q)$(BPFTOOL) gen skeleton -L $< > $@
 
 
 $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
index 2ec85fc6984f..23b74916fb84 100644
--- a/kernel/bpf/preload/iterators/iterators.c
+++ b/kernel/bpf/preload/iterators/iterators.c
@@ -10,20 +10,20 @@
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
 #include <sys/mount.h>
-#include "iterators.skel.h"
+#include "iterators.lskel.h"
 #include "bpf_preload_common.h"
 
 int to_kernel = -1;
 int from_kernel = 0;
 
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
+static int send_link_to_kernel(int link_fd, const char *link_name)
 {
 	struct bpf_preload_info obj = {};
 	struct bpf_link_info info = {};
 	__u32 info_len = sizeof(info);
 	int err;
 
-	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+	err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
 	if (err)
 		return err;
 	obj.link_id = info.id;
@@ -70,10 +70,10 @@ int main(int argc, char **argv)
 		goto cleanup;
 
 	/* send two bpf_link IDs with names to the kernel */
-	err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
+	err = send_link_to_kernel(skel->links.dump_bpf_map_fd, "maps.debug");
 	if (err)
 		goto cleanup;
-	err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
+	err = send_link_to_kernel(skel->links.dump_bpf_prog_fd, "progs.debug");
 	if (err)
 		goto cleanup;
 
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h
new file mode 100644
index 000000000000..d90562d672d2
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel.h
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <stdlib.h>
+#include <bpf/bpf.h>
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+	struct bpf_loader_ctx ctx;
+	struct {
+		struct bpf_map_desc rodata;
+	} maps;
+	struct {
+		struct bpf_prog_desc dump_bpf_map;
+		struct bpf_prog_desc dump_bpf_prog;
+	} progs;
+	struct {
+		int dump_bpf_map_fd;
+		int dump_bpf_prog_fd;
+	} links;
+	struct iterators_bpf__rodata {
+	} *rodata;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+	int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+	int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+	if (fd > 0)
+		skel->links.dump_bpf_map_fd = fd;
+	return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+	int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+	int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+	if (fd > 0)
+		skel->links.dump_bpf_prog_fd = fd;
+	return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+	int ret = 0;
+
+	ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+	ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+	return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+	skel_closenz(skel->links.dump_bpf_map_fd);
+	skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+	if (!skel)
+		return;
+	iterators_bpf__detach(skel);
+	skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+	skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+	munmap(skel->rodata, 4096);
+	skel_closenz(skel->maps.rodata.map_fd);
+	free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+	struct iterators_bpf *skel;
+
+	skel = calloc(sizeof(*skel), 1);
+	if (!skel)
+		goto cleanup;
+	skel->ctx.sz = (void *)&skel->links - (void *)skel;
+	skel->rodata =
+		mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+		     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (skel->rodata == (void *) -1)
+		goto cleanup;
+	memcpy(skel->rodata, (void *)"\
+\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 98);
+	skel->maps.rodata.initial_value = (__u64)(long)skel->rodata;
+	return skel;
+cleanup:
+	iterators_bpf__destroy(skel);
+	return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+	struct bpf_load_and_run_opts opts = {};
+	int err;
+
+	opts.ctx = (struct bpf_loader_ctx *)skel;
+	opts.data_sz = 6056;
+	opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
+\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
+\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
+\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
+\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
+\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
+\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
+\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
+\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
+\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
+\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
+\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
+\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
+\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
+\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
+\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
+\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
+\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
+\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
+\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
+\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
+\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
+\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
+\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
+\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
+\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
+\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
+\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
+\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
+\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
+\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
+\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
+\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
+\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
+\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
+\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
+\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
+\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
+\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
+\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
+\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
+\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
+\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
+\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
+\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
+\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
+\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
+\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
+\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
+\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
+\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
+\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
+\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
+\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
+\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
+\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
+\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
+\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
+\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
+\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
+\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
+\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
+\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
+\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
+\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
+\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
+\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
+\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
+\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
+\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
+\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
+\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
+\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
+\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
+\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
+\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
+\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
+\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
+\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
+\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
+\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
+\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
+\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
+\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
+\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
+\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
+\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
+\0\0\0\0";
+	opts.insns_sz = 2184;
+	opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x20\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x18\0\0\0\0\0\x15\x03\x04\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x85\0\0\0\x94\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\x01\0\
+\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\xa6\0\
+\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xa3\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\x01\0\0\
+\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\xb7\x03\
+\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x96\xff\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x40\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\0\x7b\x01\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\
+\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\
+\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\
+\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\x0c\0\0\
+\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x60\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\x77\x07\
+\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\
+\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\
+\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\
+\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4e\xff\0\0\0\0\x63\
+\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x18\x12\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\x7b\x01\0\0\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\0\0\0\0\
+\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\0\0\0\0\
+\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\0\0\0\
+\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\
+\0\0\xc5\x07\x17\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\x70\x6c\
+\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\
+\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x05\
+\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\x02\0\0\0\
+\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\x63\x06\
+\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0";
+	err = bpf_load_and_run(&opts);
+	if (err < 0)
+		return err;
+	skel->rodata =
+		mmap(skel->rodata, 4096, PROT_READ, MAP_SHARED | MAP_FIXED,
+			skel->maps.rodata.map_fd, 0);
+	return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+	struct iterators_bpf *skel;
+
+	skel = iterators_bpf__open();
+	if (!skel)
+		return NULL;
+	if (iterators_bpf__load(skel)) {
+		iterators_bpf__destroy(skel);
+		return NULL;
+	}
+	return skel;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644
index cf9a6a94b3a4..000000000000
--- a/kernel/bpf/preload/iterators/iterators.skel.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
-	struct bpf_object_skeleton *skeleton;
-	struct bpf_object *obj;
-	struct {
-		struct bpf_map *rodata;
-	} maps;
-	struct {
-		struct bpf_program *dump_bpf_map;
-		struct bpf_program *dump_bpf_prog;
-	} progs;
-	struct {
-		struct bpf_link *dump_bpf_map;
-		struct bpf_link *dump_bpf_prog;
-	} links;
-	struct iterators_bpf__rodata {
-		char dump_bpf_map____fmt[35];
-		char dump_bpf_map____fmt_1[14];
-		char dump_bpf_prog____fmt[32];
-		char dump_bpf_prog____fmt_2[17];
-	} *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
-	if (!obj)
-		return;
-	if (obj->skeleton)
-		bpf_object__destroy_skeleton(obj->skeleton);
-	free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
-	struct iterators_bpf *obj;
-
-	obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
-	if (!obj)
-		return NULL;
-	if (iterators_bpf__create_skeleton(obj))
-		goto err;
-	if (bpf_object__open_skeleton(obj->skeleton, opts))
-		goto err;
-
-	return obj;
-err:
-	iterators_bpf__destroy(obj);
-	return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
-	return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
-	return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
-	struct iterators_bpf *obj;
-
-	obj = iterators_bpf__open();
-	if (!obj)
-		return NULL;
-	if (iterators_bpf__load(obj)) {
-		iterators_bpf__destroy(obj);
-		return NULL;
-	}
-	return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
-	return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
-	return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
-	struct bpf_object_skeleton *s;
-
-	s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
-	if (!s)
-		return -1;
-	obj->skeleton = s;
-
-	s->sz = sizeof(*s);
-	s->name = "iterators_bpf";
-	s->obj = &obj->obj;
-
-	/* maps */
-	s->map_cnt = 1;
-	s->map_skel_sz = sizeof(*s->maps);
-	s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
-	if (!s->maps)
-		goto err;
-
-	s->maps[0].name = "iterator.rodata";
-	s->maps[0].map = &obj->maps.rodata;
-	s->maps[0].mmaped = (void **)&obj->rodata;
-
-	/* programs */
-	s->prog_cnt = 2;
-	s->prog_skel_sz = sizeof(*s->progs);
-	s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
-	if (!s->progs)
-		goto err;
-
-	s->progs[0].name = "dump_bpf_map";
-	s->progs[0].prog = &obj->progs.dump_bpf_map;
-	s->progs[0].link = &obj->links.dump_bpf_map;
-
-	s->progs[1].name = "dump_bpf_prog";
-	s->progs[1].prog = &obj->progs.dump_bpf_prog;
-	s->progs[1].link = &obj->links.dump_bpf_prog;
-
-	s->data_sz = 7176;
-	s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
-	return 0;
-err:
-	bpf_object__destroy_skeleton(s);
-	return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
-- 
cgit v1.2.3


From 18ef5dac934a0fb1aeb7d6bee6c81e1fa6bcc598 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 31 Jan 2022 14:05:27 -0800
Subject: bpf: Open code obj_get_info_by_fd in bpf preload.

Open code obj_get_info_by_fd in bpf preload.
It's the last part of libbpf that preload/iterators were using.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220131220528.98088-7-alexei.starovoitov@gmail.com
---
 kernel/bpf/preload/iterators/iterators.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
index 23b74916fb84..4dafe0f4f2b2 100644
--- a/kernel/bpf/preload/iterators/iterators.c
+++ b/kernel/bpf/preload/iterators/iterators.c
@@ -16,6 +16,22 @@
 int to_kernel = -1;
 int from_kernel = 0;
 
+static int __bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
+{
+	union bpf_attr attr;
+	int err;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.info.bpf_fd = bpf_fd;
+	attr.info.info_len = *info_len;
+	attr.info.info = (long) info;
+
+	err = skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+	if (!err)
+		*info_len = attr.info.info_len;
+	return err;
+}
+
 static int send_link_to_kernel(int link_fd, const char *link_name)
 {
 	struct bpf_preload_info obj = {};
@@ -23,7 +39,7 @@ static int send_link_to_kernel(int link_fd, const char *link_name)
 	__u32 info_len = sizeof(info);
 	int err;
 
-	err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
+	err = __bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
 	if (err)
 		return err;
 	obj.link_id = info.id;
-- 
cgit v1.2.3


From e96f2d64c812d9c20adea38a9b5e08feaa21fcf5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 31 Jan 2022 14:05:28 -0800
Subject: bpf: Drop libbpf, libelf, libz dependency from bpf preload.

Drop libbpf, libelf, libz dependency from bpf preload.
This reduces bpf_preload_umd binary size
from 1.7M to 30k unstripped with debug info
and from 300k to 19k stripped.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220131220528.98088-8-alexei.starovoitov@gmail.com
---
 kernel/bpf/preload/Makefile | 28 ++--------------------------
 1 file changed, 2 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1400ac58178e..baf47d9c7557 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,40 +1,16 @@
 # SPDX-License-Identifier: GPL-2.0
 
 LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_OUT = $(abspath $(obj))/libbpf
-LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
-LIBBPF_DESTDIR = $(LIBBPF_OUT)
-LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A): | $(LIBBPF_OUT)
-	$(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/   \
-		DESTDIR=$(LIBBPF_DESTDIR) prefix=			       \
-		$(LIBBPF_OUT)/libbpf.a install_headers
-
-libbpf_hdrs: $(LIBBPF_A)
-
-.PHONY: libbpf_hdrs
-
-$(LIBBPF_OUT):
-	$(call msg,MKDIR,$@)
-	$(Q)mkdir -p $@
+LIBBPF_INCLUDE = $(LIBBPF_SRCS)/..
 
 userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
 	-I $(LIBBPF_INCLUDE) -Wno-unused-result
 
 userprogs := bpf_preload_umd
 
-clean-files := libbpf/
-
-$(obj)/iterators/iterators.o: | libbpf_hdrs
-
 bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
 
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
+$(obj)/bpf_preload_umd:
 
 $(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
 
-- 
cgit v1.2.3


From 4d266c247d56751c2c97e0c411212b59e90922fc Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Date: Mon, 13 Dec 2021 11:40:24 +0530
Subject: rcu/exp: Fix check for idle context in rcu_exp_handler

For PREEMPT_RCU, the rcu_exp_handler() function checks
whether the current CPU is in idle, by calling
rcu_dynticks_curr_cpu_in_eqs(). However, rcu_exp_handler()
is called in IPI handler context. So, it should be checking
the idle context using rcu_is_cpu_rrupt_from_idle(). Fix this
by using rcu_is_cpu_rrupt_from_idle() instead of
rcu_dynticks_curr_cpu_in_eqs(). Non-preempt configuration
already uses the correct check.

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 237a79989aba..1568c8ef185b 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -656,7 +656,7 @@ static void rcu_exp_handler(void *unused)
 	 */
 	if (!depth) {
 		if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
-		    rcu_dynticks_curr_cpu_in_eqs()) {
+		    rcu_is_cpu_rrupt_from_idle()) {
 			rcu_report_exp_rdp(rdp);
 		} else {
 			WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
-- 
cgit v1.2.3


From 63c564da11cbed96ec6cf0b5faf6af0b7e3624d2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 13 Dec 2021 21:00:02 -0800
Subject: rcu: Mark ->expmask access in synchronize_rcu_expedited_wait()

This commit adds a READ_ONCE() to an access to the rcu_node structure's
->expmask field to prevent compiler mischief.  Detected by KCSAN.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 1568c8ef185b..60197ea24ceb 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -502,7 +502,8 @@ static void synchronize_rcu_expedited_wait(void)
 		if (synchronize_rcu_expedited_wait_once(1))
 			return;
 		rcu_for_each_leaf_node(rnp) {
-			for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+			mask = READ_ONCE(rnp->expmask);
+			for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
 				rdp = per_cpu_ptr(&rcu_data, cpu);
 				if (rdp->rcu_forced_tick_exp)
 					continue;
-- 
cgit v1.2.3


From a47f9f131dfe4f765e385fa90e13032eadb00bac Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 13 Dec 2021 11:05:07 -0800
Subject: rcu: Mark accesses to boost_starttime

The boost_starttime shared variable has conflicting unmarked C-language
accesses, which are dangerous at best.  This commit therefore adds
appropriate marking.  This was found by KCSAN.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 422f7e4cc08d..829ae0b7d3c0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -997,7 +997,7 @@ static int rcu_torture_boost(void *arg)
 			goto checkwait;
 
 		/* Wait for the next test interval. */
-		oldstarttime = boost_starttime;
+		oldstarttime = READ_ONCE(boost_starttime);
 		while (time_before(jiffies, oldstarttime)) {
 			schedule_timeout_interruptible(oldstarttime - jiffies);
 			if (stutter_wait("rcu_torture_boost"))
@@ -1041,10 +1041,11 @@ static int rcu_torture_boost(void *arg)
 		 * interval.  Besides, we are running at RT priority,
 		 * so delays should be relatively rare.
 		 */
-		while (oldstarttime == boost_starttime && !kthread_should_stop()) {
+		while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
 			if (mutex_trylock(&boost_mutex)) {
 				if (oldstarttime == boost_starttime) {
-					boost_starttime = jiffies + test_boost_interval * HZ;
+					WRITE_ONCE(boost_starttime,
+						   jiffies + test_boost_interval * HZ);
 					n_rcu_torture_boosts++;
 				}
 				mutex_unlock(&boost_mutex);
-- 
cgit v1.2.3


From 02e3024175274ed4bf7912e7a1281b300cec76b5 Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Date: Sat, 11 Dec 2021 22:31:39 +0530
Subject: rcu/nocb: Handle concurrent nocb kthreads creation

When multiple CPUs in the same nocb gp/cb group concurrently
come online, they might try to concurrently create the same
rcuog kthread. Fix this by using nocb gp CPU's spawn mutex to
provide mutual exclusion for the rcuog kthread creation code.

[ paulmck: Whitespace fixes per kernel test robot feedback. ]

Acked-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h      | 2 ++
 kernel/rcu/tree_nocb.h | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 486fc901bd08..24dd4b0d805f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -203,6 +203,8 @@ struct rcu_data {
 	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */
 	struct timer_list nocb_timer;	/* Enforce finite deferral. */
 	unsigned long nocb_gp_adv_time;	/* Last call_rcu() CB adv (jiffies). */
+	struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+					    /* spawning */
 
 	/* The following fields are used by call_rcu, hence own cacheline. */
 	raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index eeafb546a7a0..1e40519d1a05 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1226,6 +1226,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 	raw_spin_lock_init(&rdp->nocb_gp_lock);
 	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
 	rcu_cblist_init(&rdp->nocb_bypass);
+	mutex_init(&rdp->nocb_gp_kthread_mutex);
 }
 
 /*
@@ -1248,13 +1249,17 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 
 	/* If we didn't spawn the GP kthread first, reorganize! */
 	rdp_gp = rdp->nocb_gp_rdp;
+	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
 	if (!rdp_gp->nocb_gp_kthread) {
 		t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
 				"rcuog/%d", rdp_gp->cpu);
-		if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+		if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+			mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
 			return;
+		}
 		WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
 	}
+	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
 
 	/* Spawn the kthread for this CPU. */
 	t = kthread_run(rcu_nocb_cb_kthread, rdp,
-- 
cgit v1.2.3


From eae9f147a4b02e132187a2d88a403b9ccc28212a Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Date: Mon, 13 Dec 2021 12:32:09 +0530
Subject: rcu: Remove unused rcu_state.boost

Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h        | 5 ++---
 kernel/rcu/tree_plugin.h | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 24dd4b0d805f..e9990945483f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -304,9 +304,8 @@ struct rcu_state {
 
 	/* The following fields are guarded by the root rcu_node's lock. */
 
-	u8	boost ____cacheline_internodealigned_in_smp;
-						/* Subject to priority boost. */
-	unsigned long gp_seq;			/* Grace-period sequence #. */
+	unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+						/* Grace-period sequence #. */
 	unsigned long gp_max;			/* Maximum GP duration in */
 						/*  jiffies. */
 	struct task_struct *gp_kthread;		/* Task for grace periods. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c5b45c2f68a1..109429e70a64 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1175,8 +1175,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
 		return;
 
-	rcu_state.boost = 1;
-
 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
 			   "rcub/%d", rnp_index);
 	if (WARN_ON_ONCE(IS_ERR(t)))
-- 
cgit v1.2.3


From 4b4399b2450de38916718ba9947e6cdb69c99c55 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Wed, 29 Dec 2021 00:05:10 +0800
Subject: rcu: Create per-cpu rcuc kthreads only when rcutree.use_softirq=0

The per-CPU "rcuc" kthreads are used only by kernels booted with
rcutree.use_softirq=0, but they are nevertheless unconditionally created
by kernels built with CONFIG_RCU_BOOST=y.  This results in "rcuc"
kthreads being created that are never actually used.  This commit
therefore refrains from creating these kthreads unless the kernel
is actually booted with rcutree.use_softirq=0.

Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4c25a6283b0..4e5e37e5ee3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2894,7 +2894,7 @@ static int __init rcu_spawn_core_kthreads(void)
 
 	for_each_possible_cpu(cpu)
 		per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
-	if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+	if (use_softirq)
 		return 0;
 	WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
 		  "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
-- 
cgit v1.2.3


From c8db27dd0ea8071d2ea29a1a401c4ccc611ec6c1 Mon Sep 17 00:00:00 2001
From: Alison Chaiken <achaiken@aurora.tech>
Date: Tue, 11 Jan 2022 15:32:50 -0800
Subject: rcu: Move kthread_prio bounds-check to a separate function

Move the bounds-check of the kthread_prio cmdline parameter to a new
function in order to faciliate a different callsite.

Signed-off-by: Alison Chaiken <achaiken@aurora.tech>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4e5e37e5ee3c..5bf0312f6676 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4440,26 +4440,10 @@ static int rcu_pm_notify(struct notifier_block *self,
 static int __init rcu_spawn_gp_kthread(void)
 {
 	unsigned long flags;
-	int kthread_prio_in = kthread_prio;
 	struct rcu_node *rnp;
 	struct sched_param sp;
 	struct task_struct *t;
 
-	/* Force priority into range. */
-	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
-	    && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
-		kthread_prio = 2;
-	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
-		kthread_prio = 1;
-	else if (kthread_prio < 0)
-		kthread_prio = 0;
-	else if (kthread_prio > 99)
-		kthread_prio = 99;
-
-	if (kthread_prio != kthread_prio_in)
-		pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
-			 kthread_prio, kthread_prio_in);
-
 	rcu_scheduler_fully_active = 1;
 	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
 	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
@@ -4584,6 +4568,28 @@ static void __init rcu_init_one(void)
 	}
 }
 
+/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+	int kthread_prio_in = kthread_prio;
+
+	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+	    && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+		kthread_prio = 2;
+	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+		kthread_prio = 1;
+	else if (kthread_prio < 0)
+		kthread_prio = 0;
+	else if (kthread_prio > 99)
+		kthread_prio = 99;
+
+	if (kthread_prio != kthread_prio_in)
+		pr_alert("%s: Limited prio to %d from %d\n",
+			 __func__, kthread_prio, kthread_prio_in);
+}
+
 /*
  * Compute the rcu_node tree geometry from kernel parameters.  This cannot
  * replace the definitions in tree.h because those are needed to size
@@ -4744,6 +4750,7 @@ void __init rcu_init(void)
 
 	kfree_rcu_batch_init();
 	rcu_bootup_announce();
+	sanitize_kthread_prio();
 	rcu_init_geometry();
 	rcu_init_one();
 	if (dump_tree)
-- 
cgit v1.2.3


From 54577e23fa0791599db1a3d86fc8e7a205d3da75 Mon Sep 17 00:00:00 2001
From: Alison Chaiken <achaiken@aurora.tech>
Date: Tue, 11 Jan 2022 15:32:51 -0800
Subject: rcu: Make priority of grace-period thread consistent

The priority of RCU grace period threads is set to kthread_prio when
they are launched from rcu_spawn_gp_kthread().  The same is not true
of rcu_spawn_one_nocb_kthread().  Accordingly, add priority elevation
to rcu_spawn_one_nocb_kthread().

Signed-off-by: Alison Chaiken <achaiken@aurora.tech>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 1e40519d1a05..ea889cbfc3b9 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1239,6 +1239,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	struct rcu_data *rdp_gp;
 	struct task_struct *t;
+	struct sched_param sp;
 
 	if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
 		return;
@@ -1248,6 +1249,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 		return;
 
 	/* If we didn't spawn the GP kthread first, reorganize! */
+	sp.sched_priority = kthread_prio;
 	rdp_gp = rdp->nocb_gp_rdp;
 	mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
 	if (!rdp_gp->nocb_gp_kthread) {
@@ -1258,6 +1260,8 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 			return;
 		}
 		WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+		if (kthread_prio)
+			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	}
 	mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
 
-- 
cgit v1.2.3


From c8b16a65267e35ecc5621dbc81cbe7e5b0992fce Mon Sep 17 00:00:00 2001
From: Alison Chaiken <achaiken@aurora.tech>
Date: Tue, 11 Jan 2022 15:32:52 -0800
Subject: rcu: Elevate priority of offloaded callback threads

When CONFIG_PREEMPT_RT=y, the rcutree.kthread_prio command-line
parameter signals initialization code to boost the priority of rcuc
callbacks to the designated value.  With the additional
CONFIG_RCU_NOCB_CPU=y configuration and an additional rcu_nocbs
command-line parameter, the callbacks on the listed cores are
offloaded to new rcuop kthreads that are not pinned to the cores whose
post-grace-period work is performed.  While the rcuop kthreads perform
the same function as the rcuc kthreads they offload, the kthread_prio
parameter only boosts the priority of the rcuc kthreads.  Fix this
inconsistency by elevating rcuop kthreads to the same priority as the rcuc
kthreads.

Signed-off-by: Alison Chaiken <achaiken@aurora.tech>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c      | 2 +-
 kernel/rcu/tree_nocb.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5bf0312f6676..9e4c5b281f00 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -153,7 +153,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
 
-/* rcuc/rcub kthread realtime priority */
+/* rcuc/rcub/rcuop kthread realtime priority */
 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
 module_param(kthread_prio, int, 0444);
 
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index ea889cbfc3b9..547c41437c76 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1270,6 +1270,9 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 			"rcuo%c/%d", rcu_state.abbr, cpu);
 	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
 		return;
+
+	if (kthread_prio)
+		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	WRITE_ONCE(rdp->nocb_cb_kthread, t);
 	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
 }
-- 
cgit v1.2.3


From 10c535787436d62ea28156a4b91365fd89b5a432 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 21 Jan 2022 12:40:08 -0800
Subject: rcu: Don't deboost before reporting expedited quiescent state

Currently rcu_preempt_deferred_qs_irqrestore() releases rnp->boost_mtx
before reporting the expedited quiescent state.  Under heavy real-time
load, this can result in this function being preempted before the
quiescent state is reported, which can in turn prevent the expedited grace
period from completing.  Tim Murray reports that the resulting expedited
grace periods can take hundreds of milliseconds and even more than one
second, when they should normally complete in less than a millisecond.

This was fine given that there were no particular response-time
constraints for synchronize_rcu_expedited(), as it was designed
for throughput rather than latency.  However, some users now need
sub-100-millisecond response-time constratints.

This patch therefore follows Neeraj's suggestion (seconded by Tim and
by Uladzislau Rezki) of simply reversing the two operations.

Reported-by: Tim Murray <timmurray@google.com>
Reported-by: Joel Fernandes <joelaf@google.com>
Reported-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reviewed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Tested-by: Tim Murray <timmurray@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: <stable@vger.kernel.org> # 5.4.x
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 109429e70a64..02ac057ba3f8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -556,16 +556,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		}
 
-		/* Unboost if we were boosted. */
-		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
-			rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
-
 		/*
 		 * If this was the last task on the expedited lists,
 		 * then we need to report up the rcu_node hierarchy.
 		 */
 		if (!empty_exp && empty_exp_now)
 			rcu_report_exp_rnp(rnp, true);
+
+		/* Unboost if we were boosted. */
+		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+			rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
 	} else {
 		local_irq_restore(flags);
 	}
-- 
cgit v1.2.3


From c9515875850fefcc79492c5189fe8431e75ddec5 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Tue, 25 Jan 2022 10:47:44 +0800
Subject: rcu: Add per-CPU rcuc task dumps to RCU CPU stall warnings

When the rcutree.use_softirq kernel boot parameter is set to zero, all
RCU_SOFTIRQ processing is carried out by the per-CPU rcuc kthreads.
If these kthreads are being starved, quiescent states will not be
reported, which in turn means that the grace period will not end, which
can in turn trigger RCU CPU stall warnings.  This commit therefore dumps
stack traces of stalled CPUs' rcuc kthreads, which can help identify
what is preventing those kthreads from running.

Suggested-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Reviewed-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        |  3 +++
 kernel/rcu/tree.h        |  1 +
 kernel/rcu/tree_plugin.h |  3 +++
 kernel/rcu/tree_stall.h  | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 42 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9e4c5b281f00..bd9b2af247ab 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2850,10 +2850,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
 {
 	unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
 	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+	unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
 	int spincnt;
 
 	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
 	for (spincnt = 0; spincnt < 10; spincnt++) {
+		WRITE_ONCE(*j, jiffies);
 		local_bh_disable();
 		*statusp = RCU_KTHREAD_RUNNING;
 		local_irq_disable();
@@ -2874,6 +2876,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
 	schedule_timeout_idle(2);
 	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
 	*statusp = RCU_KTHREAD_WAITING;
+	WRITE_ONCE(*j, jiffies);
 }
 
 static struct smp_hotplug_thread rcu_cpu_thread_spec = {
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e9990945483f..b84cc5742c31 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -239,6 +239,7 @@ struct rcu_data {
 					/* rcuc per-CPU kthread or NULL. */
 	unsigned int rcu_cpu_kthread_status;
 	char rcu_cpu_has_work;
+	unsigned long rcuc_activity;
 
 	/* 7) Diagnostic data, including RCU CPU stall warnings. */
 	unsigned int softirq_snap;	/* Snapshot of softirq activity. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 02ac057ba3f8..8167cab1bffc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -996,12 +996,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
  */
 static void rcu_cpu_kthread_setup(unsigned int cpu)
 {
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 #ifdef CONFIG_RCU_BOOST
 	struct sched_param sp;
 
 	sp.sched_priority = kthread_prio;
 	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
+
+	WRITE_ONCE(rdp->rcuc_activity, jiffies);
 }
 
 #ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 21bebf7c9030..0c5d8516516a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -379,6 +379,15 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
 	return j > 2 * HZ;
 }
 
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+	unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+	if (jp)
+		*jp = j;
+	return j > 2 * HZ;
+}
+
 /*
  * Print out diagnostic information for the specified stalled CPU.
  *
@@ -430,6 +439,29 @@ static void print_cpu_stall_info(int cpu)
 	       falsepositive ? " (false positive?)" : "");
 }
 
+static void rcuc_kthread_dump(struct rcu_data *rdp)
+{
+	int cpu;
+	unsigned long j;
+	struct task_struct *rcuc;
+
+	rcuc = rdp->rcu_cpu_kthread_task;
+	if (!rcuc)
+		return;
+
+	cpu = task_cpu(rcuc);
+	if (cpu_is_offline(cpu) || idle_cpu(cpu))
+		return;
+
+	if (!rcu_is_rcuc_kthread_starving(rdp, &j))
+		return;
+
+	pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
+	sched_show_task(rcuc);
+	if (!trigger_single_cpu_backtrace(cpu))
+		dump_cpu_task(cpu);
+}
+
 /* Complain about starvation of grace-period kthread.  */
 static void rcu_check_gp_kthread_starvation(void)
 {
@@ -601,6 +633,9 @@ static void print_cpu_stall(unsigned long gps)
 	rcu_check_gp_kthread_expired_fqs_timer();
 	rcu_check_gp_kthread_starvation();
 
+	if (!use_softirq)
+		rcuc_kthread_dump(rdp);
+
 	rcu_dump_cpu_stacks();
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-- 
cgit v1.2.3


From 6f81bd6a4e305d15d9c2a6a350e2876a7a814d7e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 6 Dec 2021 15:12:14 -0800
Subject: rcutorture: Print message before invoking ->cb_barrier()

The various ->cb_barrier() functions, for example, rcu_barrier(),
sometimes cause rcutorture hangs.  But currently, the last console
message is the unenlightening "Stopping rcu_torture_stats".  This commit
therefore prints a message of the form "rcu_torture_cleanup: Invoking
rcu_barrier+0x0/0x1e0()" to help point people in the right direction.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 422f7e4cc08d..00400aef5818 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2905,8 +2905,10 @@ rcu_torture_cleanup(void)
 	int i;
 
 	if (torture_cleanup_begin()) {
-		if (cur_ops->cb_barrier != NULL)
+		if (cur_ops->cb_barrier != NULL) {
+			pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
 			cur_ops->cb_barrier();
+		}
 		return;
 	}
 	if (!cur_ops) {
@@ -2961,8 +2963,10 @@ rcu_torture_cleanup(void)
 	 * Wait for all RCU callbacks to fire, then do torture-type-specific
 	 * cleanup operations.
 	 */
-	if (cur_ops->cb_barrier != NULL)
+	if (cur_ops->cb_barrier != NULL) {
+		pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
 		cur_ops->cb_barrier();
+	}
 	if (cur_ops->cleanup != NULL)
 		cur_ops->cleanup();
 
-- 
cgit v1.2.3


From 2b4a7f20f160e6440848c62a70ee5dc5237a2c8b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 16 Dec 2021 12:23:31 -0800
Subject: torture: Distinguish kthread stopping and being asked to stop

Right now, if a given kthread (call it "kthread") realizes that it needs
to stop, "Stopping kthread" is written to the console.  When the cleanup
code decides that it is time to stop that kthread, "Stopping kthread
tasks" is written to the console.  These two events might happen in
either order, especially in the case of time-based torture-test shutdown.

But it is hard to distinguish these, especially for those unfamiliar with
the torture tests.  This commit therefore changes the first case from
"Stopping kthread" to "kthread is stopping" to make things more clear.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/torture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index ef27a6c82451..f55d803f995d 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -911,7 +911,7 @@ void torture_kthread_stopping(char *title)
 {
 	char buf[128];
 
-	snprintf(buf, sizeof(buf), "Stopping %s", title);
+	snprintf(buf, sizeof(buf), "%s is stopping", title);
 	VERBOSE_TOROUT_STRING(buf);
 	while (!kthread_should_stop()) {
 		torture_shutdown_absorb(title);
-- 
cgit v1.2.3


From 05b724655bf1908abf531dd0bce455e55703a3a8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 16 Dec 2021 15:36:02 -0800
Subject: rcutorture: Increase visibility of forward-progress hangs

This commit adds a few pr_alert() calls to rcutorture's forward-progress
testing in order to better diagnose shutdown-time hangs.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 00400aef5818..fefc3fa1a9c2 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2281,6 +2281,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
 	unsigned long stopat;
 	static DEFINE_TORTURE_RANDOM(trs);
 
+	pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
 	if (!cur_ops->sync)
 		return; // Cannot do need_resched() forward progress testing without ->sync.
 	if (cur_ops->call && cur_ops->cb_barrier) {
@@ -2325,6 +2326,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
 	if (selfpropcb) {
 		WRITE_ONCE(fcs.stop, 1);
 		cur_ops->sync(); /* Wait for running CB to complete. */
+		pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
 		cur_ops->cb_barrier(); /* Wait for queued callbacks. */
 	}
 
@@ -2353,6 +2355,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 	unsigned long stopat;
 	unsigned long stoppedat;
 
+	pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
 	if (READ_ONCE(rcu_fwd_emergency_stop))
 		return; /* Get out of the way quickly, no GP wait! */
 	if (!cur_ops->call)
@@ -2414,6 +2417,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 	n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
 	cver = READ_ONCE(rcu_torture_current_version) - cver;
 	gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+	pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
 	cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
 	(void)rcu_torture_fwd_prog_cbfree(rfp);
 
-- 
cgit v1.2.3


From e22ef8df415d924428e35c9c112526306e684adc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Dec 2021 12:33:53 -0800
Subject: rcutorture: Make rcu_fwd_cb_nodelay be a counter

Back when only one rcutorture kthread could do forward-progress testing,
it was just fine for rcu_fwd_cb_nodelay to be a non-atomic bool.  It was
set at the start of forward-progress testing and cleared at the end.
But now that there are multiple threads, the value can be cleared while
one of the threads is still doing forward-progress testing.  This commit
therefore makes rcu_fwd_cb_nodelay be an atomic counter, replacing the
WRITE_ONCE() operations with atomic_inc() and atomic_dec().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index fefc3fa1a9c2..afe95c694895 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -284,7 +284,7 @@ static atomic_t barrier_cbs_invoked;	/* Barrier callbacks invoked. */
 static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
 static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
 
-static bool rcu_fwd_cb_nodelay;		/* Short rcu_torture_delay() delays. */
+static atomic_t rcu_fwd_cb_nodelay;	/* Short rcu_torture_delay() delays. */
 
 /*
  * Allocate an element from the rcu_tortures pool.
@@ -387,7 +387,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
 	 * period, and we want a long delay occasionally to trigger
 	 * force_quiescent_state. */
 
-	if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
+	if (!atomic_read(&rcu_fwd_cb_nodelay) &&
 	    !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
 		started = cur_ops->get_gp_seq();
 		ts = rcu_trace_clock_local();
@@ -1276,7 +1276,7 @@ rcu_torture_writer(void *arg)
 		boot_ended = rcu_inkernel_boot_has_ended();
 		stutter_waited = stutter_wait("rcu_torture_writer");
 		if (stutter_waited &&
-		    !READ_ONCE(rcu_fwd_cb_nodelay) &&
+		    !atomic_read(&rcu_fwd_cb_nodelay) &&
 		    !cur_ops->slow_gps &&
 		    !torture_must_stop() &&
 		    boot_ended)
@@ -2290,7 +2290,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
 	}
 
 	/* Tight loop containing cond_resched(). */
-	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+	atomic_inc(&rcu_fwd_cb_nodelay);
 	cur_ops->sync(); /* Later readers see above write. */
 	if  (selfpropcb) {
 		WRITE_ONCE(fcs.stop, 0);
@@ -2335,7 +2335,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
 		destroy_rcu_head_on_stack(&fcs.rh);
 	}
 	schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
-	WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+	atomic_dec(&rcu_fwd_cb_nodelay);
 }
 
 /* Carry out call_rcu() forward-progress testing. */
@@ -2362,7 +2362,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 		return; /* Can't do call_rcu() fwd prog without ->call. */
 
 	/* Loop continuously posting RCU callbacks. */
-	WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+	atomic_inc(&rcu_fwd_cb_nodelay);
 	cur_ops->sync(); /* Later readers see above write. */
 	WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
 	stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
@@ -2435,7 +2435,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 	}
 	schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
 	tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
-	WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+	atomic_dec(&rcu_fwd_cb_nodelay);
 }
 
 
-- 
cgit v1.2.3


From 02b51a1cf47977d8772c7dcc363ef6a1e6e59f21 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 17 Dec 2021 15:05:05 -0800
Subject: rcutorture: Add end-of-test check to rcu_torture_fwd_prog() loop

The second and subsequent forward-progress kthreads loop waiting for
the first forward-progress kthread to start the next test interval.
Unfortunately, if the test ends while one of those kthreads is waiting,
the test will hang.  This hang occurs because that wait loop fails to
check for the end of the test.  This commit therefore adds an end-of-test
check to that wait loop.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index afe95c694895..e99658efd97f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2515,7 +2515,7 @@ static int rcu_torture_fwd_prog(void *args)
 			firsttime = false;
 			WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
 		} else {
-			while (READ_ONCE(rcu_fwd_seq) == oldseq)
+			while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
 				schedule_timeout_interruptible(1);
 			oldseq = READ_ONCE(rcu_fwd_seq);
 		}
-- 
cgit v1.2.3


From 89440d2dad0cc2a781290470cb90402ebba481fc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 28 Dec 2021 15:59:38 -0800
Subject: rcutorture: Fix rcu_fwd_mutex deadlock

The rcu_torture_fwd_cb_hist() function acquires rcu_fwd_mutex, but is
invoked from rcutorture_oom_notify() function, which hold this same
mutex across this call.  This commit fixes the resulting deadlock.

Reported-by: kernel test robot <oliver.sang@intel.com>
Tested-by: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index e99658efd97f..1c8f40b90f70 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2180,7 +2180,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
 	for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
 		if (rfp->n_launders_hist[i].n_launders > 0)
 			break;
-	mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
 	pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
 		 __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
 	gps_old = rfp->rcu_launder_gp_seq_start;
@@ -2193,7 +2192,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
 		gps_old = gps;
 	}
 	pr_cont("\n");
-	mutex_unlock(&rcu_fwd_mutex);
 }
 
 /* Callback function for continuous-flood RCU callbacks. */
@@ -2431,7 +2429,9 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
 			 n_launders, n_launders_sa,
 			 n_max_gps, n_max_cbs, cver, gps);
 		atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+		mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
 		rcu_torture_fwd_cb_hist(rfp);
+		mutex_unlock(&rcu_fwd_mutex);
 	}
 	schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
 	tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
-- 
cgit v1.2.3


From 6b8646a9d37c6324cf994dbefb75f3eb20b109ab Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 3 Jan 2022 06:07:09 -0800
Subject: torture: Wake up kthreads after storing task_struct pointer

Currently, _torture_create_kthread() uses kthread_run() to create
torture-test kthreads, which means that the resulting task_struct
pointer is stored after the newly created kthread has been marked
runnable.  This in turn can cause spurious failure of checks for
code being run by a particular kthread.  This commit therefore changes
_torture_create_kthread() to use kthread_create(), then to do an explicit
wake_up_process() after the task_struct pointer has been stored.

Reported-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/torture.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index f55d803f995d..789aeb0e1159 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -931,12 +931,14 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
 	int ret = 0;
 
 	VERBOSE_TOROUT_STRING(m);
-	*tp = kthread_run(fn, arg, "%s", s);
+	*tp = kthread_create(fn, arg, "%s", s);
 	if (IS_ERR(*tp)) {
 		ret = PTR_ERR(*tp);
 		TOROUT_ERRSTRING(f);
 		*tp = NULL;
+		return ret;
 	}
+	wake_up_process(*tp);  // Process is sleeping, so ordering provided.
 	torture_shuffle_task_register(*tp);
 	return ret;
 }
-- 
cgit v1.2.3


From 9c0f1c7fd7c6e1e0f5b84e20c577fbab62563d03 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jan 2022 20:29:10 -0800
Subject: rcutorture: Enable limited callback-flooding tests of SRCU

This commit allows up to 50,000 callbacks worth of callback-flooding
tests of SRCU.  The goal of this change is to exercise Tree SRCU's
ability to transition from SRCU_SIZE_SMALL to SRCU_SIZE_BIG triggered
by callback-queue-time lock contention.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 1c8f40b90f70..b41db719085e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -674,6 +674,7 @@ static struct rcu_torture_ops srcu_ops = {
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
+	.cbflood_max	= 50000,
 	.irq_capable	= 1,
 	.no_pi_lock	= IS_ENABLED(CONFIG_TINY_SRCU),
 	.name		= "srcu"
@@ -708,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = {
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
+	.cbflood_max	= 50000,
 	.irq_capable	= 1,
 	.no_pi_lock	= IS_ENABLED(CONFIG_TINY_SRCU),
 	.name		= "srcud"
-- 
cgit v1.2.3


From fc153c1c58cb8c3bb3b443b4d7dc3211ff5f65fc Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Sun, 5 Dec 2021 22:38:15 -0500
Subject: clocksource: Add a Kconfig option for WATCHDOG_MAX_SKEW

A watchdog maximum skew of 100us may still be too small for
some systems or archs. It may also be too small when some kernel
debug config options are enabled.  So add a new Kconfig option
CLOCKSOURCE_WATCHDOG_MAX_SKEW_US to allow kernel builders to have more
control on the threshold for marking clocksource as unstable.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/time/Kconfig       | 9 +++++++++
 kernel/time/clocksource.c | 8 +++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 04bfd62f5e5c..27b7868b5c30 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -181,5 +181,14 @@ config HIGH_RES_TIMERS
 	  hardware is not capable then this option only increases
 	  the size of the kernel image.
 
+config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+	int "Clocksource watchdog maximum allowable skew (in μs)"
+	depends on CLOCKSOURCE_WATCHDOG
+	range 50 1000
+	default 100
+	help
+	  Specify the maximum amount of allowable watchdog skew in
+	  microseconds before reporting the clocksource to be unstable.
+
 endmenu
 endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1cf73807b450..95d7ca35bdf2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,7 +107,13 @@ static u64 suspend_start;
  * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
  * a lower bound for cs->uncertainty_margin values when registering clocks.
  */
-#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#define MAX_SKEW_USEC	CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#else
+#define MAX_SKEW_USEC	100
+#endif
+
+#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
-- 
cgit v1.2.3


From bfdf4e6208051ed7165b2e92035b4bf11f43eb63 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 27 Jan 2022 10:27:20 -0500
Subject: rseq: Remove broken uapi field layout on 32-bit little endian

The rseq rseq_cs.ptr.{ptr32,padding} uapi endianness handling is
entirely wrong on 32-bit little endian: a preprocessor logic mistake
wrongly uses the big endian field layout on 32-bit little endian
architectures.

Fortunately, those ptr32 accessors were never used within the kernel,
and only meant as a convenience for user-space.

Remove those and replace the whole rseq_cs union by a __u64 type, as
this is the only thing really needed to express the ABI. Document how
32-bit architectures are meant to interact with this field.

Fixes: ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union, update includes")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220127152720.25898-1-mathieu.desnoyers@efficios.com
---
 kernel/rseq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rseq.c b/kernel/rseq.c
index 6d45ac3dae7f..97ac20b4f738 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
 	int ret;
 
 #ifdef CONFIG_64BIT
-	if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
+	if (get_user(ptr, &t->rseq->rseq_cs))
 		return -EFAULT;
 #else
-	if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+	if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
 		return -EFAULT;
 #endif
 	if (!ptr) {
@@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t)
 	 * Set rseq_cs to NULL.
 	 */
 #ifdef CONFIG_64BIT
-	return put_user(0UL, &t->rseq->rseq_cs.ptr64);
+	return put_user(0UL, &t->rseq->rseq_cs);
 #else
-	if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+	if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
 		return -EFAULT;
 	return 0;
 #endif
-- 
cgit v1.2.3


From c8eaf6ac76f40f6c59fc7d056e2e08c4a57ea9c7 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Fri, 28 Jan 2022 17:50:25 +0800
Subject: sched: move autogroup sysctls into its own file

move autogroup sysctls to autogroup.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220128095025.8745-1-nizhen@uniontech.com
---
 kernel/sched/autogroup.c | 23 +++++++++++++++++++++++
 kernel/sched/autogroup.h |  1 +
 kernel/sysctl.c          | 11 -----------
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 8629b37d118e..31dd2593145e 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -9,6 +9,28 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
 static struct autogroup autogroup_default;
 static atomic_t autogroup_seq_nr;
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_autogroup_sysctls[] = {
+	{
+		.procname       = "sched_autogroup_enabled",
+		.data           = &sysctl_sched_autogroup_enabled,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{}
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif
+
 void __init autogroup_init(struct task_struct *init_task)
 {
 	autogroup_default.tg = &root_task_group;
@@ -198,6 +220,7 @@ void sched_autogroup_exit(struct signal_struct *sig)
 static int __init setup_autogroup(char *str)
 {
 	sysctl_sched_autogroup_enabled = 0;
+	sched_autogroup_sysctl_init();
 
 	return 1;
 }
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b96419974a1f..90fcbfdd70c3 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -27,6 +27,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
+	extern unsigned int sysctl_sched_autogroup_enabled;
 	int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
 
 	if (enabled && task_wants_autogroup(p, tg))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ae443b2882e..1cb7ca68cd4e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1750,17 +1750,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= sysctl_sched_uclamp_handler,
 	},
 #endif
-#ifdef CONFIG_SCHED_AUTOGROUP
-	{
-		.procname	= "sched_autogroup_enabled",
-		.data		= &sysctl_sched_autogroup_enabled,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.procname	= "sched_cfs_bandwidth_slice_us",
-- 
cgit v1.2.3


From d680ff24e9e14444c63945b43a37ede7cd6958f9 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 31 Jan 2022 09:24:51 +0200
Subject: perf/core: Fix address filter parser for multiple filters

Reset appropriate variables in the parser loop between parsing separate
filters, so that they do not interfere with parsing the next filter.

Fixes: 375637bc524952 ("perf/core: Introduce address range filtering")
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220131072453.2839535-4-adrian.hunter@intel.com
---
 kernel/events/core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 76c754e45d01..2889b82fb75e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10558,8 +10558,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			}
 
 			/* ready to consume more filters */
+			kfree(filename);
+			filename = NULL;
 			state = IF_STATE_ACTION;
 			filter = NULL;
+			kernel = 0;
 		}
 	}
 
-- 
cgit v1.2.3


From 58b2ff2c18b1e1d7232b8007a5698ec4ee7a7a0d Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 31 Jan 2022 09:24:53 +0200
Subject: perf/core: Allow kernel address filter when not filtering the kernel

The so-called 'kernel' address filter can also be useful for filtering
fixed addresses in user space.  Allow that.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220131072453.2839535-6-adrian.hunter@intel.com
---
 kernel/events/core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2889b82fb75e..afbf388a5176 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10515,8 +10515,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 		 */
 		if (state == IF_STATE_END) {
 			ret = -EINVAL;
-			if (kernel && event->attr.exclude_kernel)
-				goto fail;
 
 			/*
 			 * ACTION "filter" must have a non-zero length region
-- 
cgit v1.2.3


From 322cbb50de711814c42fb088f6d31901502c711a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 24 Jan 2022 10:39:13 +0100
Subject: block: remove genhd.h

There is no good reason to keep genhd.h separate from the main blkdev.h
header that includes it.  So fold the contents of genhd.h into blkdev.h
and remove genhd.h entirely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20220124093913.742411-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/power/hibernate.c | 1 -
 kernel/power/swap.c      | 1 -
 2 files changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e6af502c2fd7..a94044197c4a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,7 +28,6 @@
 #include <linux/gfp.h>
 #include <linux/syscore_ops.h>
 #include <linux/ctype.h>
-#include <linux/genhd.h>
 #include <linux/ktime.h>
 #include <linux/security.h>
 #include <linux/secretmem.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ad10359030a4..f1bd03129575 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
 #include <linux/file.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
-#include <linux/genhd.h>
 #include <linux/device.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-- 
cgit v1.2.3


From 07888c665b405b1cd3577ddebfeb74f4717a84c4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 24 Jan 2022 10:11:05 +0100
Subject: block: pass a block_device and opf to bio_alloc

Pass the block_device and operation that we plan to use this bio for to
bio_alloc to optimize the assignment.  NULL/0 can be passed, both for the
passthrough case on a raw request_queue and to temporarily avoid
refactoring some nasty code.

Also move the gfp_mask argument after the nr_vecs argument for a much
more logical calling convention matching what most of the kernel does.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20220124091107.642561-18-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/power/swap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1bd03129575..6c4f983cbacc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -276,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	struct bio *bio;
 	int error = 0;
 
-	bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
+	bio = bio_alloc(hib_resume_bdev, 1, op | op_flags,
+			GFP_NOIO | __GFP_HIGH);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio_set_dev(bio, hib_resume_bdev);
-	bio_set_op_attrs(bio, op, op_flags);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		pr_err("Adding page to bio failed at %llu\n",
-- 
cgit v1.2.3


From b1f866b013e6e5583f2f0bf4a61d13eddb9a1799 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 27 Jan 2022 08:05:48 +0100
Subject: block: remove blk_needs_flush_plug

blk_needs_flush_plug fails to account for the cb_list, which needs
flushing as well.  Remove it and just check if there is a plug instead
of poking into the internals of the plug structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220127070549.1377856-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/exit.c       | 2 +-
 kernel/sched/core.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..11fc6c9df9f2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -735,7 +735,7 @@ void __noreturn do_exit(long code)
 	struct task_struct *tsk = current;
 	int group_dead;
 
-	WARN_ON(blk_needs_flush_plug(tsk));
+	WARN_ON(tsk->plug);
 
 	/*
 	 * If do_dead is called because this processes oopsed, it's possible
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 848eaa0efe0e..3487bb92d1f2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6344,7 +6344,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
 	 */
-	if (blk_needs_flush_plug(tsk))
+	if (tsk->plug)
 		blk_flush_plug(tsk->plug, true);
 }
 
-- 
cgit v1.2.3


From aa8dcccaf32bfdc09f2aff089d5d60c37da5b7b5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 27 Jan 2022 08:05:49 +0100
Subject: block: check that there is a plug in blk_flush_plug

Rename blk_flush_plug to __blk_flush_plug and add a wrapper that includes
the NULL check instead of open coding that check everywhere.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20220127070549.1377856-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/sched/core.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3487bb92d1f2..46152982e400 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6344,8 +6344,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
 	 */
-	if (tsk->plug)
-		blk_flush_plug(tsk->plug, true);
+	blk_flush_plug(tsk->plug, true);
 }
 
 static void sched_update_worker(struct task_struct *tsk)
@@ -8371,9 +8370,7 @@ int io_schedule_prepare(void)
 	int old_iowait = current->in_iowait;
 
 	current->in_iowait = 1;
-	if (current->plug)
-		blk_flush_plug(current->plug, true);
-
+	blk_flush_plug(current->plug, true);
 	return old_iowait;
 }
 
-- 
cgit v1.2.3


From fe13889c390e14205e064d7e159e61eb5da4b1c3 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@gmail.com>
Date: Fri, 28 Jan 2022 19:07:27 +0800
Subject: genirq, softirq: Use in_hardirq() instead of in_irq()

Replace the obsolete and ambiguos macro in_irq() with the new macro
in_hardirq().

Signed-off-by: Changbin Du <changbin.du@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220128110727.5110-1-changbin.du@gmail.com
---
 kernel/irq/irqdesc.c |  4 ++--
 kernel/softirq.c     | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2267e6527db3..6167d32e27da 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc)
 		return -EINVAL;
 
 	data = irq_desc_get_irq_data(desc);
-	if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+	if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
 		return -EPERM;
 
 	generic_handle_irq_desc(desc);
@@ -676,7 +676,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
  */
 int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
 {
-	WARN_ON_ONCE(!in_irq());
+	WARN_ON_ONCE(!in_hardirq());
 	return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
 }
 EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 41f470929e99..fac801815554 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -222,7 +222,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 	u32 pending;
 	int curcnt;
 
-	WARN_ON_ONCE(in_irq());
+	WARN_ON_ONCE(in_hardirq());
 	lockdep_assert_irqs_enabled();
 
 	local_irq_save(flags);
@@ -305,7 +305,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 {
 	unsigned long flags;
 
-	WARN_ON_ONCE(in_irq());
+	WARN_ON_ONCE(in_hardirq());
 
 	raw_local_irq_save(flags);
 	/*
@@ -352,14 +352,14 @@ static void __local_bh_enable(unsigned int cnt)
  */
 void _local_bh_enable(void)
 {
-	WARN_ON_ONCE(in_irq());
+	WARN_ON_ONCE(in_hardirq());
 	__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
 }
 EXPORT_SYMBOL(_local_bh_enable);
 
 void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 {
-	WARN_ON_ONCE(in_irq());
+	WARN_ON_ONCE(in_hardirq());
 	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_TRACE_IRQFLAGS
 	local_irq_disable();
@@ -618,7 +618,7 @@ static inline void tick_irq_exit(void)
 
 	/* Make sure that timer wheel updates are propagated */
 	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
-		if (!in_irq())
+		if (!in_hardirq())
 			tick_nohz_irq_exit();
 	}
 #endif
-- 
cgit v1.2.3


From d7e7b42f4f956f2c68ad8cda87d750093dbba737 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 3 Feb 2022 11:17:27 -0800
Subject: bpf: Fix a btf decl_tag bug when tagging a function

syzbot reported a btf decl_tag bug with stack trace below:

  general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN
  KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
  CPU: 0 PID: 3592 Comm: syz-executor914 Not tainted 5.16.0-syzkaller-11424-gb7892f7d5cb2 #0
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  RIP: 0010:btf_type_vlen include/linux/btf.h:231 [inline]
  RIP: 0010:btf_decl_tag_resolve+0x83e/0xaa0 kernel/bpf/btf.c:3910
  ...
  Call Trace:
   <TASK>
   btf_resolve+0x251/0x1020 kernel/bpf/btf.c:4198
   btf_check_all_types kernel/bpf/btf.c:4239 [inline]
   btf_parse_type_sec kernel/bpf/btf.c:4280 [inline]
   btf_parse kernel/bpf/btf.c:4513 [inline]
   btf_new_fd+0x19fe/0x2370 kernel/bpf/btf.c:6047
   bpf_btf_load kernel/bpf/syscall.c:4039 [inline]
   __sys_bpf+0x1cbb/0x5970 kernel/bpf/syscall.c:4679
   __do_sys_bpf kernel/bpf/syscall.c:4738 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:4736 [inline]
   __x64_sys_bpf+0x75/0xb0 kernel/bpf/syscall.c:4736
   do_syscall_x64 arch/x86/entry/common.c:50 [inline]
   do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
   entry_SYSCALL_64_after_hwframe+0x44/0xae

The kasan error is triggered with an illegal BTF like below:
   type 0: void
   type 1: int
   type 2: decl_tag to func type 3
   type 3: func to func_proto type 8
The total number of types is 4 and the type 3 is illegal
since its func_proto type is out of range.

Currently, the target type of decl_tag can be struct/union, var or func.
Both struct/union and var implemented their own 'resolve' callback functions
and hence handled properly in kernel.
But func type doesn't have 'resolve' callback function. When
btf_decl_tag_resolve() tries to check func type, it tries to get
vlen of its func_proto type, which triggered the above kasan error.

To fix the issue, btf_decl_tag_resolve() needs to do btf_func_check()
before trying to accessing func_proto type.
In the current implementation, func type is checked with
btf_func_check() in the main checking function btf_check_all_types().
To fix the above kasan issue, let us implement 'resolve' callback
func type properly. The 'resolve' callback will be also called
in btf_check_all_types() for func types.

Fixes: b5ea834dde6b ("bpf: Support for new btf kind BTF_KIND_TAG")
Reported-by: syzbot+53619be9444215e785ed@syzkaller.appspotmail.com
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220203191727.741862-1-yhs@fb.com
---
 kernel/bpf/btf.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b983cee8d196..9b47972c57a4 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -419,6 +419,9 @@ static struct btf_type btf_void;
 static int btf_resolve(struct btf_verifier_env *env,
 		       const struct btf_type *t, u32 type_id);
 
+static int btf_func_check(struct btf_verifier_env *env,
+			  const struct btf_type *t);
+
 static bool btf_type_is_modifier(const struct btf_type *t)
 {
 	/* Some of them is not strictly a C modifier
@@ -595,6 +598,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
 	       btf_type_is_struct(t) ||
 	       btf_type_is_array(t) ||
 	       btf_type_is_var(t) ||
+	       btf_type_is_func(t) ||
 	       btf_type_is_decl_tag(t) ||
 	       btf_type_is_datasec(t);
 }
@@ -3571,9 +3575,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_func_resolve(struct btf_verifier_env *env,
+			    const struct resolve_vertex *v)
+{
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	int err;
+
+	err = btf_func_check(env, t);
+	if (err)
+		return err;
+
+	env_stack_pop_resolved(env, next_type_id, 0);
+	return 0;
+}
+
 static struct btf_kind_operations func_ops = {
 	.check_meta = btf_func_check_meta,
-	.resolve = btf_df_resolve,
+	.resolve = btf_func_resolve,
 	.check_member = btf_df_check_member,
 	.check_kflag_member = btf_df_check_kflag_member,
 	.log_details = btf_ref_type_log,
@@ -4194,7 +4213,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
 		return !btf_resolved_type_id(btf, type_id) &&
 		       !btf_resolved_type_size(btf, type_id);
 
-	if (btf_type_is_decl_tag(t))
+	if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
 		return btf_resolved_type_id(btf, type_id) &&
 		       !btf_resolved_type_size(btf, type_id);
 
@@ -4284,12 +4303,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
 			if (err)
 				return err;
 		}
-
-		if (btf_type_is_func(t)) {
-			err = btf_func_check(env, t);
-			if (err)
-				return err;
-		}
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 53725c4cbd4567423ff6143c5d10300e53ecf52a Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Sun, 23 Jan 2022 20:45:07 +0800
Subject: cpufreq: schedutil: Use to_gov_attr_set() to get the gov_attr_set

The to_gov_attr_set() has been moved to the cpufreq.h, so use it to get
the gov_attr_set.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 26778884d9ab..cffcd08f4ec8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov);
 
 static void sugov_tunables_free(struct kobject *kobj)
 {
-	struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+	struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
 
 	kfree(to_sugov_tunables(attr_set));
 }
-- 
cgit v1.2.3


From e70e13e7d4ab8f932f49db1c9500b30a34a6d420 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@microsoft.com>
Date: Fri, 4 Feb 2022 01:55:18 +0100
Subject: bpf: Implement bpf_core_types_are_compat().

Adopt libbpf's bpf_core_types_are_compat() for kernel duty by adding
explicit recursion limit of 2 which is enough to handle 2 levels of
function prototypes.

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204005519.60361-2-mcroce@linux.microsoft.com
---
 kernel/bpf/btf.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 104 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9b47972c57a4..11740b300de9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6798,10 +6798,113 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 }
 EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
 
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
+
+static
+int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+				const struct btf *targ_btf, __u32 targ_id,
+				int level)
+{
+	const struct btf_type *local_type, *targ_type;
+	int depth = 32; /* max recursion depth */
+
+	/* caller made sure that names match (ignoring flavor suffix) */
+	local_type = btf_type_by_id(local_btf, local_id);
+	targ_type = btf_type_by_id(targ_btf, targ_id);
+	if (btf_kind(local_type) != btf_kind(targ_type))
+		return 0;
+
+recur:
+	depth--;
+	if (depth < 0)
+		return -EINVAL;
+
+	local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
+	targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
+	if (!local_type || !targ_type)
+		return -EINVAL;
+
+	if (btf_kind(local_type) != btf_kind(targ_type))
+		return 0;
+
+	switch (btf_kind(local_type)) {
+	case BTF_KIND_UNKN:
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+	case BTF_KIND_ENUM:
+	case BTF_KIND_FWD:
+		return 1;
+	case BTF_KIND_INT:
+		/* just reject deprecated bitfield-like integers; all other
+		 * integers are by default compatible between each other
+		 */
+		return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
+	case BTF_KIND_PTR:
+		local_id = local_type->type;
+		targ_id = targ_type->type;
+		goto recur;
+	case BTF_KIND_ARRAY:
+		local_id = btf_array(local_type)->type;
+		targ_id = btf_array(targ_type)->type;
+		goto recur;
+	case BTF_KIND_FUNC_PROTO: {
+		struct btf_param *local_p = btf_params(local_type);
+		struct btf_param *targ_p = btf_params(targ_type);
+		__u16 local_vlen = btf_vlen(local_type);
+		__u16 targ_vlen = btf_vlen(targ_type);
+		int i, err;
+
+		if (local_vlen != targ_vlen)
+			return 0;
+
+		for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
+			if (level <= 0)
+				return -EINVAL;
+
+			btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
+			btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
+			err = __bpf_core_types_are_compat(local_btf, local_id,
+							  targ_btf, targ_id,
+							  level - 1);
+			if (err <= 0)
+				return err;
+		}
+
+		/* tail recurse for return type check */
+		btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
+		btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
+		goto recur;
+	}
+	default:
+		return 0;
+	}
+}
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ *   - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ *     kind should match for local and target types (i.e., STRUCT is not
+ *     compatible with UNION);
+ *   - for ENUMs, the size is ignored;
+ *   - for INT, size and signedness are ignored;
+ *   - for ARRAY, dimensionality is ignored, element types are checked for
+ *     compatibility recursively;
+ *   - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ *   - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ *   - FUNC_PROTOs are compatible if they have compatible signature: same
+ *     number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
 int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
 			      const struct btf *targ_btf, __u32 targ_id)
 {
-	return -EOPNOTSUPP;
+	return __bpf_core_types_are_compat(local_btf, local_id,
+					   targ_btf, targ_id,
+					   MAX_TYPES_ARE_COMPAT_DEPTH);
 }
 
 static bool bpf_core_is_flavor_sep(const char *s)
-- 
cgit v1.2.3


From 2f293651eca3eacaeb56747dede31edace7329d2 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 20 Dec 2021 16:38:02 +0000
Subject: livepatch: Fix build failure on 32 bits processors

Trying to build livepatch on powerpc/32 results in:

	kernel/livepatch/core.c: In function 'klp_resolve_symbols':
	kernel/livepatch/core.c:221:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
	  221 |                 sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
	      |                       ^
	kernel/livepatch/core.c:221:21: error: assignment to 'Elf32_Sym *' {aka 'struct elf32_sym *'} from incompatible pointer type 'Elf64_Sym *' {aka 'struct elf64_sym *'} [-Werror=incompatible-pointer-types]
	  221 |                 sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
	      |                     ^
	kernel/livepatch/core.c: In function 'klp_apply_section_relocs':
	kernel/livepatch/core.c:312:35: error: passing argument 1 of 'klp_resolve_symbols' from incompatible pointer type [-Werror=incompatible-pointer-types]
	  312 |         ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname);
	      |                                   ^~~~~~~
	      |                                   |
	      |                                   Elf32_Shdr * {aka struct elf32_shdr *}
	kernel/livepatch/core.c:193:44: note: expected 'Elf64_Shdr *' {aka 'struct elf64_shdr *'} but argument is of type 'Elf32_Shdr *' {aka 'struct elf32_shdr *'}
	  193 | static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
	      |                                ~~~~~~~~~~~~^~~~~~~

Fix it by using the right types instead of forcing 64 bits types.

Fixes: 7c8e2bdd5f0d ("livepatch: Apply vmlinux-specific KLP relocations early")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Petr Mladek <pmladek@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/5288e11b018a762ea3351cc8fb2d4f15093a4457.1640017960.git.christophe.leroy@csgroup.eu
---
 kernel/livepatch/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 585494ec464f..bc475e62279d 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
 	return -EINVAL;
 }
 
-static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
 			       unsigned int symndx, Elf_Shdr *relasec,
 			       const char *sec_objname)
 {
@@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
 	relas = (Elf_Rela *) relasec->sh_addr;
 	/* For each rela in this klp relocation section */
 	for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
-		sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+		sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
 		if (sym->st_shndx != SHN_LIVEPATCH) {
 			pr_err("symbol %s is not marked as a livepatch symbol\n",
 			       strtab + sym->st_name);
-- 
cgit v1.2.3


From 3486bedd99196ecdfe99c0ab5b67ad3c47e8a8fa Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 4 Feb 2022 10:57:35 -0800
Subject: bpf: Use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem

This enables sub-page memory charge and allocation.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204185742.271030-3-song@kernel.org
---
 kernel/bpf/core.c       | 17 ++++++++---------
 kernel/bpf/trampoline.c |  6 +++---
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 04a8d5bea552..6ca0550c4b24 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -833,12 +833,11 @@ static int __init bpf_jit_charge_init(void)
 }
 pure_initcall(bpf_jit_charge_init);
 
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
 {
-	if (atomic_long_add_return(pages, &bpf_jit_current) >
-	    (bpf_jit_limit >> PAGE_SHIFT)) {
+	if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
 		if (!bpf_capable()) {
-			atomic_long_sub(pages, &bpf_jit_current);
+			atomic_long_sub(size, &bpf_jit_current);
 			return -EPERM;
 		}
 	}
@@ -846,9 +845,9 @@ int bpf_jit_charge_modmem(u32 pages)
 	return 0;
 }
 
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
 {
-	atomic_long_sub(pages, &bpf_jit_current);
+	atomic_long_sub(size, &bpf_jit_current);
 }
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -879,11 +878,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
 	pages = size / PAGE_SIZE;
 
-	if (bpf_jit_charge_modmem(pages))
+	if (bpf_jit_charge_modmem(size))
 		return NULL;
 	hdr = bpf_jit_alloc_exec(size);
 	if (!hdr) {
-		bpf_jit_uncharge_modmem(pages);
+		bpf_jit_uncharge_modmem(size);
 		return NULL;
 	}
 
@@ -906,7 +905,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	u32 pages = hdr->pages;
 
 	bpf_jit_free_exec(hdr);
-	bpf_jit_uncharge_modmem(pages);
+	bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
 }
 
 /* This symbol is only overridden by archs that have different
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4b6974a195c1..e76a488c09c3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
 	im = container_of(work, struct bpf_tramp_image, work);
 	bpf_image_ksym_del(&im->ksym);
 	bpf_jit_free_exec(im->image);
-	bpf_jit_uncharge_modmem(1);
+	bpf_jit_uncharge_modmem(PAGE_SIZE);
 	percpu_ref_exit(&im->pcref);
 	kfree_rcu(im, rcu);
 }
@@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 	if (!im)
 		goto out;
 
-	err = bpf_jit_charge_modmem(1);
+	err = bpf_jit_charge_modmem(PAGE_SIZE);
 	if (err)
 		goto out_free_im;
 
@@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 out_free_image:
 	bpf_jit_free_exec(im->image);
 out_uncharge:
-	bpf_jit_uncharge_modmem(1);
+	bpf_jit_uncharge_modmem(PAGE_SIZE);
 out_free_im:
 	kfree(im);
 out:
-- 
cgit v1.2.3


From ed2d9e1a26cca963ff5ed3b76326d70f7d8201a9 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 4 Feb 2022 10:57:36 -0800
Subject: bpf: Use size instead of pages in bpf_binary_header

This is necessary to charge sub page memory for the BPF program.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204185742.271030-4-song@kernel.org
---
 kernel/bpf/core.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6ca0550c4b24..14199228a6f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog)
 	WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
 
 	prog->aux->ksym.start = (unsigned long) prog->bpf_func;
-	prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE;
+	prog->aux->ksym.end   = addr + hdr->size;
 }
 
 static void
@@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_binary_header *hdr;
-	u32 size, hole, start, pages;
+	u32 size, hole, start;
 
 	WARN_ON_ONCE(!is_power_of_2(alignment) ||
 		     alignment > BPF_IMAGE_ALIGNMENT);
@@ -876,7 +876,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	 * random section of illegal instructions.
 	 */
 	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
-	pages = size / PAGE_SIZE;
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
@@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	/* Fill space with illegal/arch-dep instructions. */
 	bpf_fill_ill_insns(hdr, size);
 
-	hdr->pages = pages;
+	hdr->size = size;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 
 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
-	u32 pages = hdr->pages;
+	u32 size = hdr->size;
 
 	bpf_jit_free_exec(hdr);
-	bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
+	bpf_jit_uncharge_modmem(size);
 }
 
 /* This symbol is only overridden by archs that have different
-- 
cgit v1.2.3


From d00c6473b1ee9050cc36d008c6d30bf0d3de0524 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 4 Feb 2022 10:57:37 -0800
Subject: bpf: Use prog->jited_len in bpf_prog_ksym_set_addr()

Using prog->jited_len is simpler and more accurate than current
estimation (header + header->size).

Also, fix missing prog->jited_len with multi function program. This hasn't
been a real issue before this.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204185742.271030-5-song@kernel.org
---
 kernel/bpf/core.c     | 5 +----
 kernel/bpf/verifier.c | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 14199228a6f0..e3fe53df0a71 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -537,13 +537,10 @@ long bpf_jit_limit_max __read_mostly;
 static void
 bpf_prog_ksym_set_addr(struct bpf_prog *prog)
 {
-	const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
-	unsigned long addr = (unsigned long)hdr;
-
 	WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
 
 	prog->aux->ksym.start = (unsigned long) prog->bpf_func;
-	prog->aux->ksym.end   = addr + hdr->size;
+	prog->aux->ksym.end   = prog->aux->ksym.start + prog->jited_len;
 }
 
 static void
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1ae41d0cf96c..bbef86cb4e72 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13067,6 +13067,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 
 	prog->jited = 1;
 	prog->bpf_func = func[0]->bpf_func;
+	prog->jited_len = func[0]->jited_len;
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt;
 	bpf_prog_jit_attempt_done(prog);
-- 
cgit v1.2.3


From ebc1415d9b4f043cef5a1fb002ec316e32167e7a Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 4 Feb 2022 10:57:39 -0800
Subject: bpf: Introduce bpf_arch_text_copy

This will be used to copy JITed text to RO protected module memory. On
x86, bpf_arch_text_copy is implemented with text_poke_copy.

bpf_arch_text_copy returns pointer to dst on success, and ERR_PTR(errno)
on errors.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204185742.271030-7-song@kernel.org
---
 kernel/bpf/core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e3fe53df0a71..a5ec480f9862 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2440,6 +2440,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	return -ENOTSUPP;
 }
 
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
-- 
cgit v1.2.3


From 57631054fae6dcc9c892ae6310b58bbb6f6e5048 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 4 Feb 2022 10:57:40 -0800
Subject: bpf: Introduce bpf_prog_pack allocator

Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could add significant
pressure to instruction TLB. High iTLB pressure usually causes slow down
for the whole system, which includes visible performance degradation for
production workloads.

Introduce bpf_prog_pack allocator to pack multiple BPF programs in a huge
page. The memory is then allocated in 64 byte chunks.

Memory allocated by bpf_prog_pack allocator is RO protected after initial
allocation. To write to it, the user (jit engine) need to use text poke
API.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204185742.271030-8-song@kernel.org
---
 kernel/bpf/core.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a5ec480f9862..7ae590897b73 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -805,6 +805,133 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 	return slot;
 }
 
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE
+#define BPF_PROG_CHUNK_SHIFT	6
+#define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK	(~(BPF_PROG_CHUNK_SIZE - 1))
+#define BPF_PROG_CHUNK_COUNT	(BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+struct bpf_prog_pack {
+	struct list_head list;
+	void *ptr;
+	unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
+};
+
+#define BPF_PROG_MAX_PACK_PROG_SIZE	HPAGE_PMD_SIZE
+#define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+	struct bpf_prog_pack *pack;
+
+	pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+	if (!pack)
+		return NULL;
+	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+	if (!pack->ptr) {
+		kfree(pack);
+		return NULL;
+	}
+	bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+	list_add_tail(&pack->list, &pack_list);
+
+	set_vm_flush_reset_perms(pack->ptr);
+	set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+	struct bpf_prog_pack *pack;
+	unsigned long pos;
+	void *ptr = NULL;
+
+	if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+		size = round_up(size, PAGE_SIZE);
+		ptr = module_alloc(size);
+		if (ptr) {
+			set_vm_flush_reset_perms(ptr);
+			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
+			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+		}
+		return ptr;
+	}
+	mutex_lock(&pack_mutex);
+	list_for_each_entry(pack, &pack_list, list) {
+		pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+						 nbits, 0);
+		if (pos < BPF_PROG_CHUNK_COUNT)
+			goto found_free_area;
+	}
+
+	pack = alloc_new_pack();
+	if (!pack)
+		goto out;
+
+	pos = 0;
+
+found_free_area:
+	bitmap_set(pack->bitmap, pos, nbits);
+	ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+	mutex_unlock(&pack_mutex);
+	return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+	struct bpf_prog_pack *pack = NULL, *tmp;
+	unsigned int nbits;
+	unsigned long pos;
+	void *pack_ptr;
+
+	if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+		module_memfree(hdr);
+		return;
+	}
+
+	pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
+	mutex_lock(&pack_mutex);
+
+	list_for_each_entry(tmp, &pack_list, list) {
+		if (tmp->ptr == pack_ptr) {
+			pack = tmp;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+		goto out;
+
+	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+	pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+	bitmap_clear(pack->bitmap, pos, nbits);
+	if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+				       BPF_PROG_CHUNK_COUNT, 0) == 0) {
+		list_del(&pack->list);
+		module_memfree(pack->ptr);
+		kfree(pack);
+	}
+out:
+	mutex_unlock(&pack_mutex);
+}
+
 static atomic_long_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
-- 
cgit v1.2.3


From 33c9805860e584b194199cab1a1e81f4e6395408 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 4 Feb 2022 10:57:41 -0800
Subject: bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]

This is the jit binary allocator built on top of bpf_prog_pack.

bpf_prog_pack allocates RO memory, which cannot be used directly by the
JIT engine. Therefore, a temporary rw buffer is allocated for the JIT
engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to copy
the program to the RO memory.

bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal
instructions, which is small than the 128 bytes space reserved by
bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr
to find the correct header. Also, flag use_bpf_prog_pack is added to
differentiate a program allocated by bpf_jit_binary_pack_alloc.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220204185742.271030-9-song@kernel.org
---
 kernel/bpf/core.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7ae590897b73..306aa63fa58e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	bpf_jit_uncharge_modmem(size);
 }
 
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+			  unsigned int alignment,
+			  struct bpf_binary_header **rw_header,
+			  u8 **rw_image,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+	struct bpf_binary_header *ro_header;
+	u32 size, hole, start;
+
+	WARN_ON_ONCE(!is_power_of_2(alignment) ||
+		     alignment > BPF_IMAGE_ALIGNMENT);
+
+	/* add 16 bytes for a random section of illegal instructions */
+	size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+	if (bpf_jit_charge_modmem(size))
+		return NULL;
+	ro_header = bpf_prog_pack_alloc(size);
+	if (!ro_header) {
+		bpf_jit_uncharge_modmem(size);
+		return NULL;
+	}
+
+	*rw_header = kvmalloc(size, GFP_KERNEL);
+	if (!*rw_header) {
+		bpf_prog_pack_free(ro_header);
+		bpf_jit_uncharge_modmem(size);
+		return NULL;
+	}
+
+	/* Fill space with illegal/arch-dep instructions. */
+	bpf_fill_ill_insns(*rw_header, size);
+	(*rw_header)->size = size;
+
+	hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+		     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+	start = (get_random_int() % hole) & ~(alignment - 1);
+
+	*image_ptr = &ro_header->image[start];
+	*rw_image = &(*rw_header)->image[start];
+
+	return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+				 struct bpf_binary_header *ro_header,
+				 struct bpf_binary_header *rw_header)
+{
+	void *ptr;
+
+	ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+	kvfree(rw_header);
+
+	if (IS_ERR(ptr)) {
+		bpf_prog_pack_free(ro_header);
+		return PTR_ERR(ptr);
+	}
+	prog->aux->use_bpf_prog_pack = true;
+	return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ *   1) when the program is freed after;
+ *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ * Also, ro_header->size in 2) is not properly set yet, so rw_header->size
+ * is used for uncharge.
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+			      struct bpf_binary_header *rw_header)
+{
+	u32 size = rw_header ? rw_header->size : ro_header->size;
+
+	bpf_prog_pack_free(ro_header);
+	kvfree(rw_header);
+	bpf_jit_uncharge_modmem(size);
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+	unsigned long real_start = (unsigned long)fp->bpf_func;
+	unsigned long addr;
+
+	if (fp->aux->use_bpf_prog_pack)
+		addr = real_start & BPF_PROG_CHUNK_MASK;
+	else
+		addr = real_start & PAGE_MASK;
+
+	return (void *)addr;
+}
+
 /* This symbol is only overridden by archs that have different
  * requirements than the usual eBPF JITs, f.e. when they only
  * implement cBPF JIT, do not set images read-only, etc.
@@ -1040,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	if (fp->jited) {
 		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
-		bpf_jit_binary_free(hdr);
+		if (fp->aux->use_bpf_prog_pack)
+			bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+		else
+			bpf_jit_binary_free(hdr);
 
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
 	}
-- 
cgit v1.2.3


From 82980b1622d97017053c6792382469d7dc26a486 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Tue, 16 Feb 2021 15:04:34 +0000
Subject: rcu: Kill rnp->ofl_seq and use only rcu_state.ofl_lock for exclusion

If we allow architectures to bring APs online in parallel, then we end
up requiring rcu_cpu_starting() to be reentrant. But currently, the
manipulation of rnp->ofl_seq is not thread-safe.

However, rnp->ofl_seq is also fairly much pointless anyway since both
rcu_cpu_starting() and rcu_report_dead() hold rcu_state.ofl_lock for
fairly much the whole time that rnp->ofl_seq is set to an odd number
to indicate that an operation is in progress.

So drop rnp->ofl_seq completely, and use only rcu_state.ofl_lock.

This has a couple of minor complexities: lockdep will complain when we
take rcu_state.ofl_lock, and currently accepts the 'excuse' of having
an odd value in rnp->ofl_seq. So switch it to an arch_spinlock_t to
avoid that false positive complaint. Since we're killing rnp->ofl_seq
of course that 'excuse' has to be changed too, so make it check for
arch_spin_is_locked(rcu_state.ofl_lock).

There's no arch_spin_lock_irqsave() so we have to manually save and
restore local interrupts around the locking.

At Paul's request based on Neeraj's analysis, make rcu_gp_init not just
wait but *exclude* any CPU online/offline activity, which was fairly
much true already by virtue of it holding rcu_state.ofl_lock.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 71 ++++++++++++++++++++++++++++---------------------------
 kernel/rcu/tree.h |  4 +---
 2 files changed, 37 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4c25a6283b0..73a4c9d07b86 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,7 +91,7 @@ static struct rcu_state rcu_state = {
 	.abbr = RCU_ABBR,
 	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
 	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
-	.ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
 };
 
 /* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1175,7 +1175,15 @@ bool rcu_lockdep_current_cpu_online(void)
 	preempt_disable_notrace();
 	rdp = this_cpu_ptr(&rcu_data);
 	rnp = rdp->mynode;
-	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
+	/*
+	 * Strictly, we care here about the case where the current CPU is
+	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * not being up to date. So arch_spin_is_locked() might have a
+	 * false positive if it's held by some *other* CPU, but that's
+	 * OK because that just means a false *negative* on the warning.
+	 */
+	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) ||
+	    arch_spin_is_locked(&rcu_state.ofl_lock))
 		ret = true;
 	preempt_enable_notrace();
 	return ret;
@@ -1739,7 +1747,6 @@ static void rcu_strict_gp_boundary(void *unused)
  */
 static noinline_for_stack bool rcu_gp_init(void)
 {
-	unsigned long firstseq;
 	unsigned long flags;
 	unsigned long oldmask;
 	unsigned long mask;
@@ -1782,22 +1789,17 @@ static noinline_for_stack bool rcu_gp_init(void)
 	 * of RCU's Requirements documentation.
 	 */
 	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+	/* Exclude CPU hotplug operations. */
 	rcu_for_each_leaf_node(rnp) {
-		// Wait for CPU-hotplug operations that might have
-		// started before this grace period did.
-		smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
-		firstseq = READ_ONCE(rnp->ofl_seq);
-		if (firstseq & 0x1)
-			while (firstseq == READ_ONCE(rnp->ofl_seq))
-				schedule_timeout_idle(1);  // Can't wake unless RCU is watching.
-		smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
-		raw_spin_lock(&rcu_state.ofl_lock);
-		raw_spin_lock_irq_rcu_node(rnp);
+		local_irq_save(flags);
+		arch_spin_lock(&rcu_state.ofl_lock);
+		raw_spin_lock_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure. */
-			raw_spin_unlock_irq_rcu_node(rnp);
-			raw_spin_unlock(&rcu_state.ofl_lock);
+			raw_spin_unlock_rcu_node(rnp);
+			arch_spin_unlock(&rcu_state.ofl_lock);
+			local_irq_restore(flags);
 			continue;
 		}
 
@@ -1832,8 +1834,9 @@ static noinline_for_stack bool rcu_gp_init(void)
 				rcu_cleanup_dead_rnp(rnp);
 		}
 
-		raw_spin_unlock_irq_rcu_node(rnp);
-		raw_spin_unlock(&rcu_state.ofl_lock);
+		raw_spin_unlock_rcu_node(rnp);
+		arch_spin_unlock(&rcu_state.ofl_lock);
+		local_irq_restore(flags);
 	}
 	rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
 
@@ -4287,11 +4290,10 @@ void rcu_cpu_starting(unsigned int cpu)
 
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+	local_irq_save(flags);
+	arch_spin_lock(&rcu_state.ofl_lock);
 	rcu_dynticks_eqs_online();
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	raw_spin_lock_rcu_node(rnp);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
 	newcpu = !(rnp->expmaskinitnext & mask);
 	rnp->expmaskinitnext |= mask;
@@ -4304,15 +4306,18 @@ void rcu_cpu_starting(unsigned int cpu)
 
 	/* An incoming CPU should never be blocking a grace period. */
 	if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+		/* rcu_report_qs_rnp() *really* wants some flags to restore */
+		unsigned long flags2;
+
+		local_irq_save(flags2);
 		rcu_disable_urgency_upon_qs(rdp);
 		/* Report QS -after- changing ->qsmaskinitnext! */
-		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+		rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
 	} else {
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+		raw_spin_unlock_rcu_node(rnp);
 	}
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+	arch_spin_unlock(&rcu_state.ofl_lock);
+	local_irq_restore(flags);
 	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
 }
 
@@ -4326,7 +4331,7 @@ void rcu_cpu_starting(unsigned int cpu)
  */
 void rcu_report_dead(unsigned int cpu)
 {
-	unsigned long flags;
+	unsigned long flags, seq_flags;
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
@@ -4340,10 +4345,8 @@ void rcu_report_dead(unsigned int cpu)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	raw_spin_lock(&rcu_state.ofl_lock);
+	local_irq_save(seq_flags);
+	arch_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
 	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
@@ -4354,10 +4357,8 @@ void rcu_report_dead(unsigned int cpu)
 	}
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-	raw_spin_unlock(&rcu_state.ofl_lock);
-	smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
-	WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
-	WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+	arch_spin_unlock(&rcu_state.ofl_lock);
+	local_irq_restore(seq_flags);
 
 	rdp->cpu_started = false;
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 486fc901bd08..4b4bcef8a974 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,8 +56,6 @@ struct rcu_node {
 				/*  Initialized from ->qsmaskinitnext at the */
 				/*  beginning of each grace period. */
 	unsigned long qsmaskinitnext;
-	unsigned long ofl_seq;	/* CPU-hotplug operation sequence count. */
-				/* Online CPUs for next grace period. */
 	unsigned long expmask;	/* CPUs or groups that need to check in */
 				/*  to allow the current expedited GP */
 				/*  to complete. */
@@ -355,7 +353,7 @@ struct rcu_state {
 	const char *name;			/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 
-	raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+	arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
 						/* Synchronize offline with */
 						/*  GP pre-initialization. */
 };
-- 
cgit v1.2.3


From 0cabb47af3cfaeb6007ba3868379bbd4daee64cc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 10 Dec 2021 16:25:20 -0800
Subject: rcu: Refactor rcu_barrier() empty-list handling

This commit saves a few lines by checking first for an empty callback
list.  If the callback list is empty, then that CPU is taken care of,
regardless of its online or nocb state.  Also simplify tracing accordingly
and fold a few lines together.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 73a4c9d07b86..57a7a0065750 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4030,8 +4030,7 @@ void rcu_barrier(void)
 
 	/* Did someone else do our work for us? */
 	if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
-		rcu_barrier_trace(TPS("EarlyExit"), -1,
-				  rcu_state.barrier_sequence);
+		rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
 		smp_mb(); /* caller's subsequent code after above check. */
 		mutex_unlock(&rcu_state.barrier_mutex);
 		return;
@@ -4059,26 +4058,18 @@ void rcu_barrier(void)
 	 */
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
-		if (cpu_is_offline(cpu) &&
-		    !rcu_rdp_is_offloaded(rdp))
+		if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+			rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
 			continue;
-		if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
-			rcu_barrier_trace(TPS("OnlineQ"), cpu,
-					  rcu_state.barrier_sequence);
+		}
+		if (cpu_online(cpu)) {
+			rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
 			smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
-		} else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
-			   cpu_is_offline(cpu)) {
-			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
-					  rcu_state.barrier_sequence);
+		} else {
+			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
 			local_irq_disable();
 			rcu_barrier_func((void *)cpu);
 			local_irq_enable();
-		} else if (cpu_is_offline(cpu)) {
-			rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
-					  rcu_state.barrier_sequence);
-		} else {
-			rcu_barrier_trace(TPS("OnlineNQ"), cpu,
-					  rcu_state.barrier_sequence);
 		}
 	}
 	cpus_read_unlock();
-- 
cgit v1.2.3


From a16578dd5e3a44b53ca0699ac2971679dab97484 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 14 Dec 2021 13:15:18 -0800
Subject: rcu: Rework rcu_barrier() and callback-migration logic

This commit reworks rcu_barrier() and callback-migration logic to
permit allowing rcu_barrier() to run concurrently with CPU-hotplug
operations.  The key trick is for callback migration to check to see if
an rcu_barrier() is in flight, and, if so, enqueue the ->barrier_head
callback on its behalf.

This commit adds synchronization with RCU's CPU-hotplug notifiers.  Taken
together, this will permit a later commit to remove the cpus_read_lock()
and cpus_read_unlock() calls from rcu_barrier().

[ paulmck: Updated per kbuild test robot feedback. ]
[ paulmck: Updated per reviews session with Neeraj, Frederic, Uladzislau, and Boqun. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 77 +++++++++++++++++++++++++++++++++++++++++++------------
 kernel/rcu/tree.h |  2 ++
 2 files changed, 63 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 57a7a0065750..004ff1c0d192 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3987,13 +3987,16 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
 }
 
 /*
- * Called with preemption disabled, and from cross-cpu IRQ context.
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
  */
-static void rcu_barrier_func(void *cpu_in)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
 {
-	uintptr_t cpu = (uintptr_t)cpu_in;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
 
+	lockdep_assert_held(&rdp->barrier_lock);
+	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+		return;
 	rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
@@ -4003,10 +4006,26 @@ static void rcu_barrier_func(void *cpu_in)
 		atomic_inc(&rcu_state.barrier_cpu_count);
 	} else {
 		debug_rcu_head_unqueue(&rdp->barrier_head);
-		rcu_barrier_trace(TPS("IRQNQ"), -1,
-				  rcu_state.barrier_sequence);
+		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
 	}
 	rcu_nocb_unlock(rdp);
+	smp_store_release(&rdp->barrier_seq_snap, gseq);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_handler(void *cpu_in)
+{
+	uintptr_t cpu = (uintptr_t)cpu_in;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+	lockdep_assert_irqs_disabled();
+	WARN_ON_ONCE(cpu != rdp->cpu);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+	raw_spin_lock(&rdp->barrier_lock);
+	rcu_barrier_entrain(rdp);
+	raw_spin_unlock(&rdp->barrier_lock);
 }
 
 /**
@@ -4020,6 +4039,8 @@ static void rcu_barrier_func(void *cpu_in)
 void rcu_barrier(void)
 {
 	uintptr_t cpu;
+	unsigned long flags;
+	unsigned long gseq;
 	struct rcu_data *rdp;
 	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
 
@@ -4038,6 +4059,7 @@ void rcu_barrier(void)
 
 	/* Mark the start of the barrier operation. */
 	rcu_seq_start(&rcu_state.barrier_sequence);
+	gseq = rcu_state.barrier_sequence;
 	rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
 
 	/*
@@ -4058,19 +4080,30 @@ void rcu_barrier(void)
 	 */
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
+retry:
+		if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
+			continue;
+		raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
 		if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+			WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+			raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
 			rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
 			continue;
 		}
-		if (cpu_online(cpu)) {
-			rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
-			smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
-		} else {
+		if (!rcu_rdp_cpu_online(rdp)) {
+			rcu_barrier_entrain(rdp);
+			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+			raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
 			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
-			local_irq_disable();
-			rcu_barrier_func((void *)cpu);
-			local_irq_enable();
+			continue;
 		}
+		raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+		if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+			schedule_timeout_uninterruptible(1);
+			goto retry;
+		}
+		WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+		rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
 	}
 	cpus_read_unlock();
 
@@ -4087,6 +4120,12 @@ void rcu_barrier(void)
 	/* Mark the end of the barrier operation. */
 	rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
 	rcu_seq_end(&rcu_state.barrier_sequence);
+	gseq = rcu_state.barrier_sequence;
+	for_each_possible_cpu(cpu) {
+		rdp = per_cpu_ptr(&rcu_data, cpu);
+
+		WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+	}
 
 	/* Other rcu_barrier() invocations can now safely proceed. */
 	mutex_unlock(&rcu_state.barrier_mutex);
@@ -4134,6 +4173,8 @@ rcu_boot_init_percpu_data(int cpu)
 	INIT_WORK(&rdp->strict_work, strict_work_handler);
 	WARN_ON_ONCE(rdp->dynticks_nesting != 1);
 	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+	raw_spin_lock_init(&rdp->barrier_lock);
+	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
 	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
 	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
 	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
@@ -4284,8 +4325,10 @@ void rcu_cpu_starting(unsigned int cpu)
 	local_irq_save(flags);
 	arch_spin_lock(&rcu_state.ofl_lock);
 	rcu_dynticks_eqs_online();
+	raw_spin_lock(&rdp->barrier_lock);
 	raw_spin_lock_rcu_node(rnp);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
+	raw_spin_unlock(&rdp->barrier_lock);
 	newcpu = !(rnp->expmaskinitnext & mask);
 	rnp->expmaskinitnext |= mask;
 	/* Allow lockless access for expedited grace periods. */
@@ -4372,7 +4415,9 @@ void rcutree_migrate_callbacks(int cpu)
 	    rcu_segcblist_empty(&rdp->cblist))
 		return;  /* No callbacks to migrate. */
 
-	local_irq_save(flags);
+	raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
+	WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+	rcu_barrier_entrain(rdp);
 	my_rdp = this_cpu_ptr(&rcu_data);
 	my_rnp = my_rdp->mynode;
 	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
@@ -4382,10 +4427,10 @@ void rcutree_migrate_callbacks(int cpu)
 	needwake = rcu_advance_cbs(my_rnp, rdp) ||
 		   rcu_advance_cbs(my_rnp, my_rdp);
 	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+	raw_spin_unlock(&rdp->barrier_lock); /* irqs remain disabled. */
 	needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
 	rcu_segcblist_disable(&rdp->cblist);
-	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
-		     !rcu_segcblist_n_cbs(&my_rdp->cblist));
+	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
 	if (rcu_rdp_is_offloaded(my_rdp)) {
 		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
 		__call_rcu_nocb_wake(my_rdp, true, flags);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4b4bcef8a974..84362951ed9e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -188,6 +188,8 @@ struct rcu_data {
 	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
 
 	/* 4) rcu_barrier(), OOM callbacks, and expediting. */
+	raw_spinlock_t barrier_lock;	/* Protects ->barrier_seq_snap. */
+	unsigned long barrier_seq_snap;	/* Snap of rcu_state.barrier_sequence. */
 	struct rcu_head barrier_head;
 	int exp_dynticks_snap;		/* Double-check need for IPI. */
 
-- 
cgit v1.2.3


From 80b3fd474c91b3ecfd845b4a0bfb58706b877ba5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 14 Dec 2021 13:35:17 -0800
Subject: rcu: Make rcu_barrier() no longer block CPU-hotplug operations

This commit removes the cpus_read_lock() and cpus_read_unlock() calls
from rcu_barrier(), thus allowing CPUs to come and go during the course
of rcu_barrier() execution.  Posting of the ->barrier_head callbacks does
synchronize with portions of RCU's CPU-hotplug notifiers, but these locks
are held for short time periods on both sides.  Thus, full CPU-hotplug
operations could both start and finish during the execution of a given
rcu_barrier() invocation.

Additional synchronization is provided by a global ->barrier_lock.
Since the ->barrier_lock is only used during rcu_barrier() execution and
during onlining/offlining a CPU, the contention for this lock should
be low.  It might be tempting to make use of a per-CPU lock just on
general principles, but straightforward attempts to do this have the
problems shown below.

Initial state: 3 CPUs present, CPU 0 and CPU1 do not have
any callback and CPU2 has callbacks.

1. CPU0 calls rcu_barrier().

2. CPU1 starts offlining for CPU2. CPU1 calls
   rcutree_migrate_callbacks(). rcu_barrier_entrain() is called
   from rcutree_migrate_callbacks(), with CPU2's rdp->barrier_lock.
   It does not entrain ->barrier_head for CPU2, as rcu_barrier()
   on CPU0 hasn't started the barrier sequence (by calling
   rcu_seq_start(&rcu_state.barrier_sequence)) yet.

3. CPU0 starts new barrier sequence. It iterates over
   CPU0 and CPU1, after acquiring their per-cpu ->barrier_lock
   and finds 0 segcblist length. It updates ->barrier_seq_snap
   for CPU0 and CPU1 and continues loop iteration to CPU2.

    for_each_possible_cpu(cpu) {
        raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
        if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
            WRITE_ONCE(rdp->barrier_seq_snap, gseq);
            raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
            rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
            continue;
        }

4. rcutree_migrate_callbacks() completes execution on CPU1.
   Segcblist len for CPU2 becomes 0.

5. The loop iteration on CPU0, checks rcu_segcblist_n_cbs(&rdp->cblist)
   for CPU2 and completes the loop iteration after setting
   ->barrier_seq_snap.

6. As there isn't any ->barrier_head callback entrained; at
   this point, rcu_barrier() in CPU0 returns.

7. The callbacks, which migrated from CPU2 to CPU1, execute.

Straightforward per-CPU locking is also subject to the following race
condition noted by Boqun Feng:

1. CPU0 calls rcu_barrier(), starting a new barrier sequence by invoking
   rcu_seq_start() and init_completion(), but does not yet initialize
   rcu_state.barrier_cpu_count.

2. CPU1 starts offlining for CPU2, calling rcutree_migrate_callbacks(),
   which in turn calls rcu_barrier_entrain() holding CPU2's.
   rdp->barrier_lock.  It then entrains ->barrier_head for CPU2
   and atomically increments rcu_state.barrier_cpu_count, which is
   unfortunately not yet initialized to the value 2.

3. The just-entrained RCU callback is invoked.  It atomically
   decrements rcu_state.barrier_cpu_count and sees that it is
   now zero.  This callback therefore invokes complete().

4. CPU0 continues executing rcu_barrier(), but is not blocked
   by its call to wait_for_completion().  This results in rcu_barrier()
   returning before all pre-existing callbacks have been invoked,
   which is a bug.

Therefore, synchronization is provided by rcu_state.barrier_lock,
which is also held across the initialization sequence, especially the
rcu_seq_start() and the atomic_set() that sets rcu_state.barrier_cpu_count
to the value 2.  In addition, this lock is held when entraining the
rcu_barrier() callback, when deciding whether or not a CPU has callbacks
that rcu_barrier() must wait on, when setting the ->qsmaskinitnext for
incoming CPUs, and when migrating callbacks from a CPU that is going
offline.

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 28 ++++++++++++++--------------
 kernel/rcu/tree.h |  3 ++-
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 004ff1c0d192..2d70b91e3fbc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -87,6 +87,7 @@ static struct rcu_state rcu_state = {
 	.gp_state = RCU_GP_IDLE,
 	.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
 	.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+	.barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
 	.name = RCU_NAME,
 	.abbr = RCU_ABBR,
 	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
@@ -3994,7 +3995,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
 	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
 
-	lockdep_assert_held(&rdp->barrier_lock);
+	lockdep_assert_held(&rcu_state.barrier_lock);
 	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
 		return;
 	rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
@@ -4023,9 +4024,9 @@ static void rcu_barrier_handler(void *cpu_in)
 	lockdep_assert_irqs_disabled();
 	WARN_ON_ONCE(cpu != rdp->cpu);
 	WARN_ON_ONCE(cpu != smp_processor_id());
-	raw_spin_lock(&rdp->barrier_lock);
+	raw_spin_lock(&rcu_state.barrier_lock);
 	rcu_barrier_entrain(rdp);
-	raw_spin_unlock(&rdp->barrier_lock);
+	raw_spin_unlock(&rcu_state.barrier_lock);
 }
 
 /**
@@ -4058,6 +4059,7 @@ void rcu_barrier(void)
 	}
 
 	/* Mark the start of the barrier operation. */
+	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
 	rcu_seq_start(&rcu_state.barrier_sequence);
 	gseq = rcu_state.barrier_sequence;
 	rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
@@ -4071,7 +4073,7 @@ void rcu_barrier(void)
 	 */
 	init_completion(&rcu_state.barrier_completion);
 	atomic_set(&rcu_state.barrier_cpu_count, 2);
-	cpus_read_lock();
+	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
 
 	/*
 	 * Force each CPU with callbacks to register a new callback.
@@ -4083,21 +4085,21 @@ void rcu_barrier(void)
 retry:
 		if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
 			continue;
-		raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
+		raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
 		if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
 			WRITE_ONCE(rdp->barrier_seq_snap, gseq);
-			raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
 			rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
 			continue;
 		}
 		if (!rcu_rdp_cpu_online(rdp)) {
 			rcu_barrier_entrain(rdp);
 			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
-			raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
 			rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
 			continue;
 		}
-		raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+		raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
 		if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
 			schedule_timeout_uninterruptible(1);
 			goto retry;
@@ -4105,7 +4107,6 @@ retry:
 		WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
 		rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
 	}
-	cpus_read_unlock();
 
 	/*
 	 * Now that we have an rcu_barrier_callback() callback on each
@@ -4173,7 +4174,6 @@ rcu_boot_init_percpu_data(int cpu)
 	INIT_WORK(&rdp->strict_work, strict_work_handler);
 	WARN_ON_ONCE(rdp->dynticks_nesting != 1);
 	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
-	raw_spin_lock_init(&rdp->barrier_lock);
 	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
 	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
 	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
@@ -4325,10 +4325,10 @@ void rcu_cpu_starting(unsigned int cpu)
 	local_irq_save(flags);
 	arch_spin_lock(&rcu_state.ofl_lock);
 	rcu_dynticks_eqs_online();
-	raw_spin_lock(&rdp->barrier_lock);
+	raw_spin_lock(&rcu_state.barrier_lock);
 	raw_spin_lock_rcu_node(rnp);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
-	raw_spin_unlock(&rdp->barrier_lock);
+	raw_spin_unlock(&rcu_state.barrier_lock);
 	newcpu = !(rnp->expmaskinitnext & mask);
 	rnp->expmaskinitnext |= mask;
 	/* Allow lockless access for expedited grace periods. */
@@ -4415,7 +4415,7 @@ void rcutree_migrate_callbacks(int cpu)
 	    rcu_segcblist_empty(&rdp->cblist))
 		return;  /* No callbacks to migrate. */
 
-	raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
+	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
 	WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
 	rcu_barrier_entrain(rdp);
 	my_rdp = this_cpu_ptr(&rcu_data);
@@ -4427,7 +4427,7 @@ void rcutree_migrate_callbacks(int cpu)
 	needwake = rcu_advance_cbs(my_rnp, rdp) ||
 		   rcu_advance_cbs(my_rnp, my_rdp);
 	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
-	raw_spin_unlock(&rdp->barrier_lock); /* irqs remain disabled. */
+	raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
 	needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
 	rcu_segcblist_disable(&rdp->cblist);
 	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 84362951ed9e..a2d7ffd634cc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -188,7 +188,6 @@ struct rcu_data {
 	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
 
 	/* 4) rcu_barrier(), OOM callbacks, and expediting. */
-	raw_spinlock_t barrier_lock;	/* Protects ->barrier_seq_snap. */
 	unsigned long barrier_seq_snap;	/* Snap of rcu_state.barrier_sequence. */
 	struct rcu_head barrier_head;
 	int exp_dynticks_snap;		/* Double-check need for IPI. */
@@ -323,6 +322,8 @@ struct rcu_state {
 						/*  rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
+	raw_spinlock_t barrier_lock;		/* Protects ->barrier_seq_snap. */
+
 	struct mutex exp_mutex;			/* Serialize expedited GP. */
 	struct mutex exp_wake_mutex;		/* Serialize wakeup. */
 	unsigned long expedited_sequence;	/* Take a ticket. */
-- 
cgit v1.2.3


From 5ae0f1b58b28b53f4ab3708ef9337a2665e79664 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 10 Dec 2021 13:44:17 -0800
Subject: rcu: Create and use an rcu_rdp_cpu_online()

The pattern "rdp->grpmask & rcu_rnp_online_cpus(rnp)" occurs frequently
in RCU code in order to determine whether rdp->cpu is online from an
RCU perspective.  This commit therefore creates an rcu_rdp_cpu_online()
function to replace it.

[ paulmck: Apply kernel test robot unused-variable feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        | 21 +++++++++++++--------
 kernel/rcu/tree_plugin.h |  6 ++----
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2d70b91e3fbc..1d3507d563db 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -222,6 +222,16 @@ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
 	return READ_ONCE(rnp->qsmaskinitnext);
 }
 
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective?  This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
 /*
  * Return true if an RCU grace period is in progress.  The READ_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
@@ -1168,14 +1178,12 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
 bool rcu_lockdep_current_cpu_online(void)
 {
 	struct rcu_data *rdp;
-	struct rcu_node *rnp;
 	bool ret = false;
 
 	if (in_nmi() || !rcu_scheduler_fully_active)
 		return true;
 	preempt_disable_notrace();
 	rdp = this_cpu_ptr(&rcu_data);
-	rnp = rdp->mynode;
 	/*
 	 * Strictly, we care here about the case where the current CPU is
 	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
@@ -1183,8 +1191,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	 * false positive if it's held by some *other* CPU, but that's
 	 * OK because that just means a false *negative* on the warning.
 	 */
-	if (rdp->grpmask & rcu_rnp_online_cpus(rnp) ||
-	    arch_spin_is_locked(&rcu_state.ofl_lock))
+	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
 		ret = true;
 	preempt_enable_notrace();
 	return ret;
@@ -1269,8 +1276,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * For more detail, please refer to the "Hotplug CPU" section
 	 * of RCU's Requirements documentation.
 	 */
-	if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
-		bool onl;
+	if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
 		struct rcu_node *rnp1;
 
 		pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
@@ -1279,9 +1285,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 		for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
 			pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
 				__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
-		onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
 		pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
-			__func__, rdp->cpu, ".o"[onl],
+			__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
 			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
 			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
 		return 1; /* Break things loose after complaining. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c5b45c2f68a1..d3db2168598e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -330,7 +330,7 @@ void rcu_note_context_switch(bool preempt)
 		 * then queue the task as required based on the states
 		 * of any ongoing and expedited grace periods.
 		 */
-		WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+		WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
 		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
 		trace_rcu_preempt_task(rcu_state.name,
 				       t->pid,
@@ -773,7 +773,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 	int cpu;
 	int i;
 	struct list_head *lhp;
-	bool onl;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp1;
 
@@ -797,9 +796,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 	pr_cont("\n");
 	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
-		onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
 		pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
-			cpu, ".o"[onl],
+			cpu, ".o"[rcu_rdp_cpu_online(rdp)],
 			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
 			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
 	}
-- 
cgit v1.2.3


From 2bcd18e041fc3c2ae58f41eb5e18790c7c82c674 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Feb 2022 15:42:36 -0800
Subject: rcu-tasks: Use order_base_2() instead of ilog2()

The ilog2() function can be used to generate a shift count, but it will
generate the same count for a power of two as for one greater than a power
of two.  This results in shift counts that are larger than necessary for
systems with a power-of-two number of CPUs because the CPUs are numbered
from zero, so that the maximum CPU number is one less than that power
of two.

This commit therefore substitutes order_base_2(), which appears to have
been designed for exactly this use case.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index d64f0b1d8cd3..670c75cbcb98 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -123,7 +123,7 @@ static struct rcu_tasks rt_name =							\
 	.call_func = call,								\
 	.rtpcpu = &rt_name ## __percpu,							\
 	.name = n,									\
-	.percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS) + 1,				\
+	.percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS),				\
 	.percpu_enqueue_lim = 1,							\
 	.percpu_dequeue_lim = 1,							\
 	.barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex),		\
@@ -302,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 	if (unlikely(needadjust)) {
 		raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
 		if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
-			WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1);
+			WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
 			WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
 			smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
 			pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
@@ -417,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 	if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
 		raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
 		if (rtp->percpu_enqueue_lim > 1) {
-			WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids) + 1);
+			WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
 			smp_store_release(&rtp->percpu_enqueue_lim, 1);
 			rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
 			pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
-- 
cgit v1.2.3


From 00a8b4b54cd69d9f7ba1730d3b266469a778b1d7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Feb 2022 16:34:40 -0800
Subject: rcu-tasks: Set ->percpu_enqueue_shift to zero upon contention

Currently, call_rcu_tasks_generic() sets ->percpu_enqueue_shift to
order_base_2(nr_cpu_ids) upon encountering sufficient contention.
This does not shift to use of non-CPU-0 callback queues as intended, but
rather continues using only CPU 0's queue.  Although this does provide
some decrease in contention due to spreading work over multiple locks,
it is not the dramatic decrease that was intended.

This commit therefore makes call_rcu_tasks_generic() set
->percpu_enqueue_shift to 0.

Reported-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 670c75cbcb98..ac17348187e4 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -302,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 	if (unlikely(needadjust)) {
 		raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
 		if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
-			WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+			WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
 			WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
 			smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
 			pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
-- 
cgit v1.2.3


From c1b13a9451ab9d46eefb80a2cc4b8b3206460829 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 8 Feb 2022 14:05:09 -0800
Subject: bpf: Fix bpf_prog_pack build HPAGE_PMD_SIZE

Fix build with CONFIG_TRANSPARENT_HUGEPAGE=n with BPF_PROG_PACK_SIZE as
PAGE_SIZE.

Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220208220509.4180389-3-song@kernel.org
---
 kernel/bpf/core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 306aa63fa58e..42d96549a804 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -814,7 +814,11 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
  * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
  * to host BPF programs.
  */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE
+#else
+#define BPF_PROG_PACK_SIZE	PAGE_SIZE
+#endif
 #define BPF_PROG_CHUNK_SHIFT	6
 #define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)
 #define BPF_PROG_CHUNK_MASK	(~(BPF_PROG_CHUNK_SIZE - 1))
@@ -826,7 +830,7 @@ struct bpf_prog_pack {
 	unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
 };
 
-#define BPF_PROG_MAX_PACK_PROG_SIZE	HPAGE_PMD_SIZE
+#define BPF_PROG_MAX_PACK_PROG_SIZE	BPF_PROG_PACK_SIZE
 #define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
 
 static DEFINE_MUTEX(pack_mutex);
-- 
cgit v1.2.3


From 1f8863bfb5ca500ea1c7669b16b1931ba27fce20 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 1 Feb 2022 12:02:59 +0000
Subject: genirq: Allow the PM device to originate from irq domain

As a preparation to moving the reference to the device used for
runtime power management, add a new 'dev' field to the irqdomain
structure for that exact purpose.

The irq_chip_pm_{get,put}() helpers are made aware of the dual
location via a new private helper.

No functional change intended.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Tested-by: Tony Lindgren <tony@atomide.com>
Acked-by: Bartosz Golaszewski <brgl@bgdev.pl>
Link: https://lore.kernel.org/r/20220201120310.878267-2-maz@kernel.org
---
 kernel/irq/chip.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c09324663088..a2a12cdbe872 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1558,6 +1558,17 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	return 0;
 }
 
+static struct device *irq_get_parent_device(struct irq_data *data)
+{
+	if (data->chip->parent_device)
+		return data->chip->parent_device;
+
+	if (data->domain)
+		return data->domain->dev;
+
+	return NULL;
+}
+
 /**
  * irq_chip_pm_get - Enable power for an IRQ chip
  * @data:	Pointer to interrupt specific data
@@ -1567,12 +1578,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
  */
 int irq_chip_pm_get(struct irq_data *data)
 {
+	struct device *dev = irq_get_parent_device(data);
 	int retval;
 
-	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
-		retval = pm_runtime_get_sync(data->chip->parent_device);
+	if (IS_ENABLED(CONFIG_PM) && dev) {
+		retval = pm_runtime_get_sync(dev);
 		if (retval < 0) {
-			pm_runtime_put_noidle(data->chip->parent_device);
+			pm_runtime_put_noidle(dev);
 			return retval;
 		}
 	}
@@ -1590,10 +1602,11 @@ int irq_chip_pm_get(struct irq_data *data)
  */
 int irq_chip_pm_put(struct irq_data *data)
 {
+	struct device *dev = irq_get_parent_device(data);
 	int retval = 0;
 
-	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
-		retval = pm_runtime_put(data->chip->parent_device);
+	if (IS_ENABLED(CONFIG_PM) && dev)
+		retval = pm_runtime_put(dev);
 
 	return (retval < 0) ? retval : 0;
 }
-- 
cgit v1.2.3


From beb0622138cd2848dec06b0651a988c39d099574 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 1 Feb 2022 12:03:10 +0000
Subject: genirq: Kill irq_chip::parent_device

Now that noone is using irq_chip::parent_device in the tree, get
rid of it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Bartosz Golaszewski <brgl@bgdev.pl>
Link: https://lore.kernel.org/r/20220201120310.878267-13-maz@kernel.org
---
 kernel/irq/chip.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2a12cdbe872..24b6f2b40e5e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1560,9 +1560,6 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 
 static struct device *irq_get_parent_device(struct irq_data *data)
 {
-	if (data->chip->parent_device)
-		return data->chip->parent_device;
-
 	if (data->domain)
 		return data->domain->dev;
 
-- 
cgit v1.2.3


From b1d18a7574d0df5eb4117c14742baf8bc2b9bb74 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 9 Feb 2022 15:19:57 -0800
Subject: bpf: Extend sys_bpf commands for bpf_syscall programs.

bpf_sycall programs can be used directly by the kernel modules
to load programs and create maps via kernel skeleton.
. Export bpf_sys_bpf syscall wrapper to be used in kernel skeleton.
. Export bpf_map_get to be used in kernel skeleton.
. Allow prog_run cmd for bpf_syscall programs with recursion check.
. Enable link_create and raw_tp_open cmds.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220209232001.27490-2-alexei.starovoitov@gmail.com
---
 kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 72ce1edde950..49f88b30662a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -985,6 +985,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
 
 	return map;
 }
+EXPORT_SYMBOL(bpf_map_get);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 {
@@ -4756,23 +4757,52 @@ static bool syscall_prog_is_valid_access(int off, int size,
 	return true;
 }
 
-BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
 {
+	struct bpf_prog * __maybe_unused prog;
+
 	switch (cmd) {
 	case BPF_MAP_CREATE:
 	case BPF_MAP_UPDATE_ELEM:
 	case BPF_MAP_FREEZE:
 	case BPF_PROG_LOAD:
 	case BPF_BTF_LOAD:
+	case BPF_LINK_CREATE:
+	case BPF_RAW_TRACEPOINT_OPEN:
 		break;
-	/* case BPF_PROG_TEST_RUN:
-	 * is not part of this list to prevent recursive test_run
-	 */
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+	case BPF_PROG_TEST_RUN:
+		if (attr->test.data_in || attr->test.data_out ||
+		    attr->test.ctx_out || attr->test.duration ||
+		    attr->test.repeat || attr->test.flags)
+			return -EINVAL;
+
+		prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+
+		if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+		    attr->test.ctx_size_in > U16_MAX) {
+			bpf_prog_put(prog);
+			return -EINVAL;
+		}
+
+		if (!__bpf_prog_enter_sleepable(prog)) {
+			/* recursion detected */
+			bpf_prog_put(prog);
+			return -EBUSY;
+		}
+		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+		bpf_prog_put(prog);
+		return 0;
+#endif
 	default:
 		return -EINVAL;
 	}
 	return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
 }
+EXPORT_SYMBOL(bpf_sys_bpf);
 
 static const struct bpf_func_proto bpf_sys_bpf_proto = {
 	.func		= bpf_sys_bpf,
-- 
cgit v1.2.3


From d7beb3d6aba39480590b30c502fbaa2cc1e5e30b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 9 Feb 2022 15:20:00 -0800
Subject: bpf: Update iterators.lskel.h.

Light skeleton and skel_internal.h have changed.
Update iterators.lskel.h.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220209232001.27490-5-alexei.starovoitov@gmail.com
---
 kernel/bpf/preload/iterators/iterators.lskel.h | 141 ++++++++++++-------------
 1 file changed, 69 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h
index d90562d672d2..70f236a82fe1 100644
--- a/kernel/bpf/preload/iterators/iterators.lskel.h
+++ b/kernel/bpf/preload/iterators/iterators.lskel.h
@@ -3,8 +3,6 @@
 #ifndef __ITERATORS_BPF_SKEL_H__
 #define __ITERATORS_BPF_SKEL_H__
 
-#include <stdlib.h>
-#include <bpf/bpf.h>
 #include <bpf/skel_internal.h>
 
 struct iterators_bpf {
@@ -70,31 +68,28 @@ iterators_bpf__destroy(struct iterators_bpf *skel)
 	iterators_bpf__detach(skel);
 	skel_closenz(skel->progs.dump_bpf_map.prog_fd);
 	skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
-	munmap(skel->rodata, 4096);
+	skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
 	skel_closenz(skel->maps.rodata.map_fd);
-	free(skel);
+	skel_free(skel);
 }
 static inline struct iterators_bpf *
 iterators_bpf__open(void)
 {
 	struct iterators_bpf *skel;
 
-	skel = calloc(sizeof(*skel), 1);
+	skel = skel_alloc(sizeof(*skel));
 	if (!skel)
 		goto cleanup;
 	skel->ctx.sz = (void *)&skel->links - (void *)skel;
-	skel->rodata =
-		mmap(NULL, 4096, PROT_READ | PROT_WRITE,
-		     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-	if (skel->rodata == (void *) -1)
-		goto cleanup;
-	memcpy(skel->rodata, (void *)"\
+	skel->rodata = skel_prep_map_data((void *)"\
 \x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
 \x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
 \x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
 \x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
-\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 98);
-	skel->maps.rodata.initial_value = (__u64)(long)skel->rodata;
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
+	if (!skel->rodata)
+		goto cleanup;
+	skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
 	return skel;
 cleanup:
 	iterators_bpf__destroy(skel);
@@ -326,7 +321,7 @@ iterators_bpf__load(struct iterators_bpf *skel)
 \0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
 \0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
 \0\0\0\0";
-	opts.insns_sz = 2184;
+	opts.insns_sz = 2216;
 	opts.insns = (void *)"\
 \xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
 \0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
@@ -343,70 +338,72 @@ iterators_bpf__load(struct iterators_bpf *skel)
 \0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
 \xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
 \x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
-\0\0\0\x61\x60\x20\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
 \x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
 \0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
 \xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
-\0\x79\x63\x18\0\0\0\0\0\x15\x03\x04\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
-\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x85\0\0\0\x94\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\x01\0\
-\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\x18\
-\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\0\0\
-\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\xa6\0\
-\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xa3\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\x01\0\0\
-\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\xb7\x03\
-\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x96\xff\0\0\0\0\
-\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x11\0\
-\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\x61\0\0\0\0\
-\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x40\
-\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
-\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\0\x7b\x01\
-\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\
-\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\
-\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\
-\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\
-\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\
-\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\
-\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\0\x18\
-\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\x0c\0\0\
-\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x60\xff\
-\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\x77\x07\
-\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\
-\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\
-\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\
-\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4e\xff\0\0\0\0\x63\
-\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\0\0\0\0\
-\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x18\x12\
-\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
-\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\x7b\x01\0\0\
-\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\0\0\x18\
-\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\0\0\x61\
-\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\0\0\0\0\
-\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\0\0\0\0\
-\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\x01\0\0\
-\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\0\x63\
-\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\0\0\0\
-\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\
-\0\0\xc5\x07\x17\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\x70\x6c\
-\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\
-\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\
-\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\0\0\0\0\
-\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x05\
-\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\x02\0\0\0\
-\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\x63\x06\
-\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\0\0\0\0\
-\x95\0\0\0\0\0\0\0";
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
+\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
+\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
+\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
+\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
+\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
+\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
+\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
+\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
+\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
+\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
+\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
+\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
+\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
+\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
+\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
+\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
+\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0";
 	err = bpf_load_and_run(&opts);
 	if (err < 0)
 		return err;
-	skel->rodata =
-		mmap(skel->rodata, 4096, PROT_READ, MAP_SHARED | MAP_FIXED,
-			skel->maps.rodata.map_fd, 0);
+	skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
+					4096, PROT_READ, skel->maps.rodata.map_fd);
+	if (!skel->rodata)
+		return -ENOMEM;
 	return 0;
 }
 
-- 
cgit v1.2.3


From cb80ddc67152e72f28ff6ea8517acdf875d7381d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 9 Feb 2022 15:20:01 -0800
Subject: bpf: Convert bpf_preload.ko to use light skeleton.

The main change is a move of the single line
  #include "iterators.lskel.h"
from iterators/iterators.c to bpf_preload_kern.c.
Which means that generated light skeleton can be used from user space or
user mode driver like iterators.c or from the kernel module or the kernel itself.
The direct use of light skeleton from the kernel module simplifies the code,
since UMD is no longer necessary. The libbpf.a required user space and UMD. The
CO-RE in the kernel and generated "loader bpf program" used by the light
skeleton are capable to perform complex loading operations traditionally
provided by libbpf. In addition UMD approach was launching UMD process
every time bpffs has to be mounted. With light skeleton in the kernel
the bpf_preload kernel module loads bpf iterators once and pins them
multiple times into different bpffs mounts.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220209232001.27490-6-alexei.starovoitov@gmail.com
---
 kernel/bpf/inode.c                                |  39 ++-----
 kernel/bpf/preload/Kconfig                        |   7 +-
 kernel/bpf/preload/Makefile                       |  14 +--
 kernel/bpf/preload/bpf_preload.h                  |   8 +-
 kernel/bpf/preload/bpf_preload_kern.c             | 119 +++++++++-------------
 kernel/bpf/preload/bpf_preload_umd_blob.S         |   7 --
 kernel/bpf/preload/iterators/bpf_preload_common.h |  13 ---
 kernel/bpf/preload/iterators/iterators.c          | 108 --------------------
 kernel/bpf/syscall.c                              |   2 +
 9 files changed, 70 insertions(+), 247 deletions(-)
 delete mode 100644 kernel/bpf/preload/bpf_preload_umd_blob.S
 delete mode 100644 kernel/bpf/preload/iterators/bpf_preload_common.h
 delete mode 100644 kernel/bpf/preload/iterators/iterators.c

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8d9f7467bf..4f841e16779e 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -710,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock);
 static int populate_bpffs(struct dentry *parent)
 {
 	struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
-	struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
 	int err = 0, i;
 
 	/* grab the mutex to make sure the kernel interactions with bpf_preload
-	 * UMD are serialized
+	 * are serialized
 	 */
 	mutex_lock(&bpf_preload_lock);
 
@@ -722,40 +721,22 @@ static int populate_bpffs(struct dentry *parent)
 	if (!bpf_preload_mod_get())
 		goto out;
 
-	if (!bpf_preload_ops->info.tgid) {
-		/* preload() will start UMD that will load BPF iterator programs */
-		err = bpf_preload_ops->preload(objs);
-		if (err)
+	err = bpf_preload_ops->preload(objs);
+	if (err)
+		goto out_put;
+	for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+		bpf_link_inc(objs[i].link);
+		err = bpf_iter_link_pin_kernel(parent,
+					       objs[i].link_name, objs[i].link);
+		if (err) {
+			bpf_link_put(objs[i].link);
 			goto out_put;
-		for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
-			links[i] = bpf_link_by_id(objs[i].link_id);
-			if (IS_ERR(links[i])) {
-				err = PTR_ERR(links[i]);
-				goto out_put;
-			}
 		}
-		for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
-			err = bpf_iter_link_pin_kernel(parent,
-						       objs[i].link_name, links[i]);
-			if (err)
-				goto out_put;
-			/* do not unlink successfully pinned links even
-			 * if later link fails to pin
-			 */
-			links[i] = NULL;
-		}
-		/* finish() will tell UMD process to exit */
-		err = bpf_preload_ops->finish();
-		if (err)
-			goto out_put;
 	}
 out_put:
 	bpf_preload_mod_put();
 out:
 	mutex_unlock(&bpf_preload_lock);
-	for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
-		if (!IS_ERR_OR_NULL(links[i]))
-			bpf_link_put(links[i]);
 	return err;
 }
 
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index 26bced262473..c9d45c9d6918 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD
 
 if BPF_PRELOAD
 config BPF_PRELOAD_UMD
-	tristate "bpf_preload kernel module with user mode driver"
-	depends on CC_CAN_LINK
-	depends on m || CC_CAN_LINK_STATIC
+	tristate "bpf_preload kernel module"
 	default m
 	help
-	  This builds bpf_preload kernel module with embedded user mode driver.
+	  This builds bpf_preload kernel module with embedded BPF programs for
+	  introspection in bpffs.
 endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index baf47d9c7557..167534e3b0b4 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -3,16 +3,6 @@
 LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
 LIBBPF_INCLUDE = $(LIBBPF_SRCS)/..
 
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
-	-I $(LIBBPF_INCLUDE) -Wno-unused-result
-
-userprogs := bpf_preload_umd
-
-bpf_preload_umd-objs := iterators/iterators.o
-
-$(obj)/bpf_preload_umd:
-
-$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
-
 obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+CFLAGS_bpf_preload_kern.o += -I $(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
index 2f9932276f2e..f065c91213a0 100644
--- a/kernel/bpf/preload/bpf_preload.h
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -2,13 +2,13 @@
 #ifndef _BPF_PRELOAD_H
 #define _BPF_PRELOAD_H
 
-#include <linux/usermode_driver.h>
-#include "iterators/bpf_preload_common.h"
+struct bpf_preload_info {
+	char link_name[16];
+	struct bpf_link *link;
+};
 
 struct bpf_preload_ops {
-        struct umd_info info;
 	int (*preload)(struct bpf_preload_info *);
-	int (*finish)(void);
 	struct module *owner;
 };
 extern struct bpf_preload_ops *bpf_preload_ops;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 53736e52c1df..30207c048d36 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -2,101 +2,80 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/pid.h>
-#include <linux/fs.h>
-#include <linux/sched/signal.h>
 #include "bpf_preload.h"
+#include "iterators/iterators.lskel.h"
 
-extern char bpf_preload_umd_start;
-extern char bpf_preload_umd_end;
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
 
-static int preload(struct bpf_preload_info *obj);
-static int finish(void);
+static void free_links_and_skel(void)
+{
+	if (!IS_ERR_OR_NULL(maps_link))
+		bpf_link_put(maps_link);
+	if (!IS_ERR_OR_NULL(progs_link))
+		bpf_link_put(progs_link);
+	iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+	strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+	obj[0].link = maps_link;
+	strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+	obj[1].link = progs_link;
+	return 0;
+}
 
-static struct bpf_preload_ops umd_ops = {
-	.info.driver_name = "bpf_preload",
+static struct bpf_preload_ops ops = {
 	.preload = preload,
-	.finish = finish,
 	.owner = THIS_MODULE,
 };
 
-static int preload(struct bpf_preload_info *obj)
+static int load_skel(void)
 {
-	int magic = BPF_PRELOAD_START;
-	loff_t pos = 0;
-	int i, err;
-	ssize_t n;
+	int err;
 
-	err = fork_usermode_driver(&umd_ops.info);
+	skel = iterators_bpf__open();
+	if (!skel)
+		return -ENOMEM;
+	err = iterators_bpf__load(skel);
 	if (err)
-		return err;
-
-	/* send the start magic to let UMD proceed with loading BPF progs */
-	n = kernel_write(umd_ops.info.pipe_to_umh,
-			 &magic, sizeof(magic), &pos);
-	if (n != sizeof(magic))
-		return -EPIPE;
-
-	/* receive bpf_link IDs and names from UMD */
-	pos = 0;
-	for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
-		n = kernel_read(umd_ops.info.pipe_from_umh,
-				&obj[i], sizeof(*obj), &pos);
-		if (n != sizeof(*obj))
-			return -EPIPE;
+		goto out;
+	err = iterators_bpf__attach(skel);
+	if (err)
+		goto out;
+	maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+	if (IS_ERR(maps_link)) {
+		err = PTR_ERR(maps_link);
+		goto out;
 	}
-	return 0;
-}
-
-static int finish(void)
-{
-	int magic = BPF_PRELOAD_END;
-	struct pid *tgid;
-	loff_t pos = 0;
-	ssize_t n;
-
-	/* send the last magic to UMD. It will do a normal exit. */
-	n = kernel_write(umd_ops.info.pipe_to_umh,
-			 &magic, sizeof(magic), &pos);
-	if (n != sizeof(magic))
-		return -EPIPE;
-
-	tgid = umd_ops.info.tgid;
-	if (tgid) {
-		wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
-		umd_cleanup_helper(&umd_ops.info);
+	progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+	if (IS_ERR(progs_link)) {
+		err = PTR_ERR(progs_link);
+		goto out;
 	}
 	return 0;
+out:
+	free_links_and_skel();
+	return err;
 }
 
-static int __init load_umd(void)
+static int __init load(void)
 {
 	int err;
 
-	err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
-			    &bpf_preload_umd_end - &bpf_preload_umd_start);
+	err = load_skel();
 	if (err)
 		return err;
-	bpf_preload_ops = &umd_ops;
+	bpf_preload_ops = &ops;
 	return err;
 }
 
-static void __exit fini_umd(void)
+static void __exit fini(void)
 {
-	struct pid *tgid;
-
 	bpf_preload_ops = NULL;
-
-	/* kill UMD in case it's still there due to earlier error */
-	tgid = umd_ops.info.tgid;
-	if (tgid) {
-		kill_pid(tgid, SIGKILL, 1);
-
-		wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
-		umd_cleanup_helper(&umd_ops.info);
-	}
-	umd_unload_blob(&umd_ops.info);
+	free_links_and_skel();
 }
-late_initcall(load_umd);
-module_exit(fini_umd);
+late_initcall(load);
+module_exit(fini);
 MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
deleted file mode 100644
index f1f40223b5c3..000000000000
--- a/kernel/bpf/preload/bpf_preload_umd_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.section .init.rodata, "a"
-	.global bpf_preload_umd_start
-bpf_preload_umd_start:
-	.incbin "kernel/bpf/preload/bpf_preload_umd"
-	.global bpf_preload_umd_end
-bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
deleted file mode 100644
index 8464d1a48c05..000000000000
--- a/kernel/bpf/preload/iterators/bpf_preload_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BPF_PRELOAD_COMMON_H
-#define _BPF_PRELOAD_COMMON_H
-
-#define BPF_PRELOAD_START 0x5555
-#define BPF_PRELOAD_END 0xAAAA
-
-struct bpf_preload_info {
-	char link_name[16];
-	int link_id;
-};
-
-#endif
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
deleted file mode 100644
index 4dafe0f4f2b2..000000000000
--- a/kernel/bpf/preload/iterators/iterators.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2020 Facebook */
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <sys/mount.h>
-#include "iterators.lskel.h"
-#include "bpf_preload_common.h"
-
-int to_kernel = -1;
-int from_kernel = 0;
-
-static int __bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len)
-{
-	union bpf_attr attr;
-	int err;
-
-	memset(&attr, 0, sizeof(attr));
-	attr.info.bpf_fd = bpf_fd;
-	attr.info.info_len = *info_len;
-	attr.info.info = (long) info;
-
-	err = skel_sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
-	if (!err)
-		*info_len = attr.info.info_len;
-	return err;
-}
-
-static int send_link_to_kernel(int link_fd, const char *link_name)
-{
-	struct bpf_preload_info obj = {};
-	struct bpf_link_info info = {};
-	__u32 info_len = sizeof(info);
-	int err;
-
-	err = __bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
-	if (err)
-		return err;
-	obj.link_id = info.id;
-	if (strlen(link_name) >= sizeof(obj.link_name))
-		return -E2BIG;
-	strcpy(obj.link_name, link_name);
-	if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
-		return -EPIPE;
-	return 0;
-}
-
-int main(int argc, char **argv)
-{
-	struct iterators_bpf *skel;
-	int err, magic;
-	int debug_fd;
-
-	debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
-	if (debug_fd < 0)
-		return 1;
-	to_kernel = dup(1);
-	close(1);
-	dup(debug_fd);
-	/* now stdin and stderr point to /dev/console */
-
-	read(from_kernel, &magic, sizeof(magic));
-	if (magic != BPF_PRELOAD_START) {
-		printf("bad start magic %d\n", magic);
-		return 1;
-	}
-	/* libbpf opens BPF object and loads it into the kernel */
-	skel = iterators_bpf__open_and_load();
-	if (!skel) {
-		/* iterators.skel.h is little endian.
-		 * libbpf doesn't support automatic little->big conversion
-		 * of BPF bytecode yet.
-		 * The program load will fail in such case.
-		 */
-		printf("Failed load could be due to wrong endianness\n");
-		return 1;
-	}
-	err = iterators_bpf__attach(skel);
-	if (err)
-		goto cleanup;
-
-	/* send two bpf_link IDs with names to the kernel */
-	err = send_link_to_kernel(skel->links.dump_bpf_map_fd, "maps.debug");
-	if (err)
-		goto cleanup;
-	err = send_link_to_kernel(skel->links.dump_bpf_prog_fd, "progs.debug");
-	if (err)
-		goto cleanup;
-
-	/* The kernel will proceed with pinnging the links in bpffs.
-	 * UMD will wait on read from pipe.
-	 */
-	read(from_kernel, &magic, sizeof(magic));
-	if (magic != BPF_PRELOAD_END) {
-		printf("bad final magic %d\n", magic);
-		err = -EINVAL;
-	}
-cleanup:
-	iterators_bpf__destroy(skel);
-
-	return err != 0;
-}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 49f88b30662a..35646db3d950 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2490,6 +2490,7 @@ void bpf_link_put(struct bpf_link *link)
 		bpf_link_free(link);
 	}
 }
+EXPORT_SYMBOL(bpf_link_put);
 
 static int bpf_link_release(struct inode *inode, struct file *filp)
 {
@@ -2632,6 +2633,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 
 	return link;
 }
+EXPORT_SYMBOL(bpf_link_get_from_fd);
 
 struct bpf_tracing_link {
 	struct bpf_link link;
-- 
cgit v1.2.3


From 4cc0991abd3954609a6929234bbb8c0fe7a0298d Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Thu, 10 Feb 2022 18:49:39 -0800
Subject: bpf: Fix bpf_prog_pack build for ppc64_defconfig

bpf_prog_pack causes build error with powerpc ppc64_defconfig:

kernel/bpf/core.c:830:23: error: variably modified 'bitmap' at file scope
  830 |         unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
      |                       ^~~~~~

This is because the marco expands as:

unsigned long bitmap[((((((1UL) << (16 + __pte_index_size)) / (1 << 6))) \
     + ((sizeof(long) * 8)) - 1) / ((sizeof(long) * 8)))];

where __pte_index_size is a global variable.

Fix it by turning bitmap into a 0-length array.

Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220211024939.2962537-1-song@kernel.org
---
 kernel/bpf/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 42d96549a804..44623c9b5bb1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 struct bpf_prog_pack {
 	struct list_head list;
 	void *ptr;
-	unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
+	unsigned long bitmap[];
 };
 
 #define BPF_PROG_MAX_PACK_PROG_SIZE	BPF_PROG_PACK_SIZE
@@ -840,7 +840,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
 {
 	struct bpf_prog_pack *pack;
 
-	pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+	pack = kzalloc(sizeof(*pack) + BITS_TO_BYTES(BPF_PROG_CHUNK_COUNT), GFP_KERNEL);
 	if (!pack)
 		return NULL;
 	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
-- 
cgit v1.2.3


From 1581a884b7ca5592270caa010a910f2ed4f7b5f5 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Thu, 27 Jan 2022 15:44:19 -0600
Subject: tracing: Remove size restriction on tracing_log_err cmd strings

Currently, tracing_log_err.cmd strings are restricted to a length of
MAX_FILTER_STR_VAL (256), which is too short for some commands already
seen in the wild (with cmd strings longer than that showing up
truncated).

Remove the restriction so that no command string is ever truncated.

Link: https://lkml.kernel.org/r/ca965f23256b350ebd94b3dc1a319f28e8267f5f.1643319703.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 55 +++++++++++++++++++++++++++++++++++++++-------------
 kernel/trace/trace.h |  2 +-
 2 files changed, 43 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7c2578efde26..7c85ce9ffdc3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7723,7 +7723,7 @@ const struct file_operations trace_min_max_fops = {
 struct err_info {
 	const char	**errs;	/* ptr to loc-specific array of err strings */
 	u8		type;	/* index into errs -> specific err string */
-	u8		pos;	/* MAX_FILTER_STR_VAL = 256 */
+	u16		pos;	/* caret position */
 	u64		ts;
 };
 
@@ -7731,26 +7731,52 @@ struct tracing_log_err {
 	struct list_head	list;
 	struct err_info		info;
 	char			loc[TRACING_LOG_LOC_MAX]; /* err location */
-	char			cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+	char			*cmd;                     /* what caused err */
 };
 
 static DEFINE_MUTEX(tracing_err_log_lock);
 
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
+{
+	struct tracing_log_err *err;
+
+	err = kzalloc(sizeof(*err), GFP_KERNEL);
+	if (!err)
+		return ERR_PTR(-ENOMEM);
+
+	err->cmd = kzalloc(len, GFP_KERNEL);
+	if (!err->cmd) {
+		kfree(err);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+	kfree(err->cmd);
+	kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+						   int len)
 {
 	struct tracing_log_err *err;
 
 	if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
-		err = kzalloc(sizeof(*err), GFP_KERNEL);
-		if (!err)
-			err = ERR_PTR(-ENOMEM);
-		else
+		err = alloc_tracing_log_err(len);
+		if (PTR_ERR(err) != -ENOMEM)
 			tr->n_err_log_entries++;
 
 		return err;
 	}
 
 	err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+	kfree(err->cmd);
+	err->cmd = kzalloc(len, GFP_KERNEL);
+	if (!err->cmd)
+		return ERR_PTR(-ENOMEM);
 	list_del(&err->list);
 
 	return err;
@@ -7811,22 +7837,25 @@ unsigned int err_pos(char *cmd, const char *str)
  */
 void tracing_log_err(struct trace_array *tr,
 		     const char *loc, const char *cmd,
-		     const char **errs, u8 type, u8 pos)
+		     const char **errs, u8 type, u16 pos)
 {
 	struct tracing_log_err *err;
+	int len = 0;
 
 	if (!tr)
 		tr = &global_trace;
 
+	len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
 	mutex_lock(&tracing_err_log_lock);
-	err = get_tracing_log_err(tr);
+	err = get_tracing_log_err(tr, len);
 	if (PTR_ERR(err) == -ENOMEM) {
 		mutex_unlock(&tracing_err_log_lock);
 		return;
 	}
 
 	snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
-	snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+	snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
 
 	err->info.errs = errs;
 	err->info.type = type;
@@ -7844,7 +7873,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
 	mutex_lock(&tracing_err_log_lock);
 	list_for_each_entry_safe(err, next, &tr->err_log, list) {
 		list_del(&err->list);
-		kfree(err);
+		free_tracing_log_err(err);
 	}
 
 	tr->n_err_log_entries = 0;
@@ -7872,9 +7901,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
 	mutex_unlock(&tracing_err_log_lock);
 }
 
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
 {
-	u8 i;
+	u16 i;
 
 	for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
 		seq_putc(m, ' ');
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d038ddbf1bea..0f5e22238cd2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1878,7 +1878,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
 extern unsigned int err_pos(char *cmd, const char *str);
 extern void tracing_log_err(struct trace_array *tr,
 			    const char *loc, const char *cmd,
-			    const char **errs, u8 type, u8 pos);
+			    const char **errs, u8 type, u16 pos);
 
 /*
  * Normal trace_printk() and friends allocates special buffers
-- 
cgit v1.2.3


From edfeed318d59ff242e895bf906223fc0b915117c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 28 Jan 2022 14:08:26 -0600
Subject: tracing: Remove size restriction on hist trigger cmd error logging

Currently, hist trigger command error strings are restricted to a
length of MAX_FILTER_STR_VAL (256), which is too short for some
commands already seen in the wild (with cmd strings longer than that
showing up truncated in err_log).

Remove the restriction so that no hist trigger command error string is
ever truncated.

Link: https://lkml.kernel.org/r/0f9d46407222eaf6632cd3b417bc50a11f401b71.1643399022.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ada87bfb5bb8..5e8970624bce 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
 	return data;
 }
 
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
 static char last_cmd_loc[MAX_FILTER_STR_VAL];
 
 static int errpos(char *str)
 {
+	if (!str || !last_cmd)
+		return 0;
+
 	return err_pos(last_cmd, str);
 }
 
@@ -739,12 +744,19 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
 {
 	const char *system = NULL, *name = NULL;
 	struct trace_event_call *call;
+	int len = 0;
 
 	if (!str)
 		return;
 
-	strcpy(last_cmd, "hist:");
-	strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+	len += sizeof(HIST_PREFIX) + strlen(str) + 1;
+	kfree(last_cmd);
+	last_cmd = kzalloc(len, GFP_KERNEL);
+	if (!last_cmd)
+		return;
+
+	strcpy(last_cmd, HIST_PREFIX);
+	strncat(last_cmd, str, len - sizeof(HIST_PREFIX));
 
 	if (file) {
 		call = file->event_call;
@@ -757,18 +769,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
 	}
 
 	if (system)
-		snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+		snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
 }
 
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
 {
+	if (!last_cmd)
+		return;
+
 	tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
 			err_type, err_pos);
 }
 
 static void hist_err_clear(void)
 {
-	last_cmd[0] = '\0';
+	if (last_cmd)
+		last_cmd[0] = '\0';
 	last_cmd_loc[0] = '\0';
 }
 
@@ -5610,7 +5626,7 @@ static int event_hist_trigger_print(struct seq_file *m,
 	bool have_var = false;
 	unsigned int i;
 
-	seq_puts(m, "hist:");
+	seq_puts(m, HIST_PREFIX);
 
 	if (data->name)
 		seq_printf(m, "%s:", data->name);
-- 
cgit v1.2.3


From 27c888da9867725784bad3d6455d6e53b425fa2b Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 28 Jan 2022 14:08:27 -0600
Subject: tracing: Remove size restriction on synthetic event cmd error logging

Currently, synthetic event command error strings are restricted to a
length of MAX_FILTER_STR_VAL (256), which is too short for some
commands already seen in the wild (with cmd strings longer than that
showing up truncated in err_log).

Remove the restriction so that no synthetic event command error string
is ever truncated.

Link: https://lkml.kernel.org/r/0376692396a81d0b795127c66ea92ca5bf60f481.1643399022.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..fdd79e07e2fc 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -42,10 +42,13 @@ enum { ERRORS };
 
 static const char *err_text[] = { ERRORS };
 
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
 
 static int errpos(const char *str)
 {
+	if (!str || !last_cmd)
+		return 0;
+
 	return err_pos(last_cmd, str);
 }
 
@@ -54,11 +57,19 @@ static void last_cmd_set(const char *str)
 	if (!str)
 		return;
 
-	strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+	kfree(last_cmd);
+	last_cmd = kzalloc(strlen(str) + 1, GFP_KERNEL);
+	if (!last_cmd)
+		return;
+
+	strncpy(last_cmd, str, strlen(str) + 1);
 }
 
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
 {
+	if (!last_cmd)
+		return;
+
 	tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
 			err_type, err_pos);
 }
-- 
cgit v1.2.3


From 55bc8384d3deadce48923d8329c1434494c52273 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 10 Feb 2022 19:36:05 -0500
Subject: tracing: Save both wakee and current on wakeup events

Use the sched_switch function to save both the wakee and the waker comms
in the saved cmdlines list when sched_wakeup is done.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_switch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..5cd33be2031b 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -44,7 +44,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
 
 	if (!flags)
 		return;
-	tracing_record_taskinfo(current, flags);
+	tracing_record_taskinfo_sched_switch(current, wakee, flags);
 }
 
 static int tracing_sched_register(void)
-- 
cgit v1.2.3


From 7f5a08c79df35e68f1a43033450c5050f12bc155 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 18 Jan 2022 12:43:15 -0800
Subject: user_events: Add minimal support for trace_event into ftrace

Minimal support for interacting with dynamic events, trace_event and
ftrace. Core outline of flow between user process, ioctl and trace_event
APIs.

User mode processes that wish to use trace events to get data into
ftrace, perf, eBPF, etc are limited to uprobes today. The user events
features enables an ABI for user mode processes to create and write to
trace events that are isolated from kernel level trace events. This
enables a faster path for tracing from user mode data as well as opens
managed code to participate in trace events, where stub locations are
dynamic.

User processes often want to trace only when it's useful. To enable this
a set of pages are mapped into the user process space that indicate the
current state of the user events that have been registered. User
processes can check if their event is hooked to a trace/probe, and if it
is, emit the event data out via the write() syscall.

Two new files are introduced into tracefs to accomplish this:
user_events_status - This file is mmap'd into participating user mode
processes to indicate event status.

user_events_data - This file is opened and register/delete ioctl's are
issued to create/open/delete trace events that can be used for tracing.

The typical scenario is on process start to mmap user_events_status. Processes
then register the events they plan to use via the REG ioctl. The ioctl reads
and updates the passed in user_reg struct. The status_index of the struct is
used to know the byte in the status page to check for that event. The
write_index of the struct is used to describe that event when writing out to
the fd that was used for the ioctl call. The data must always include this
index first when writing out data for an event. Data can be written either by
write() or by writev().

For example, in memory:
int index;
char data[];

Psuedo code example of typical usage:
struct user_reg reg;

int page_fd = open("user_events_status", O_RDWR);
char *page_data = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, page_fd, 0);
close(page_fd);

int data_fd = open("user_events_data", O_RDWR);

reg.size = sizeof(reg);
reg.name_args = (__u64)"test";

ioctl(data_fd, DIAG_IOCSREG, &reg);
int status_id = reg.status_index;
int write_id = reg.write_index;

struct iovec io[2];
io[0].iov_base = &write_id;
io[0].iov_len = sizeof(write_id);
io[1].iov_base = payload;
io[1].iov_len = sizeof(payload);

if (page_data[status_id])
	writev(data_fd, io, 2);

User events are also exposed via the dynamic_events tracefs file for
both create and delete. Current status is exposed via the user_events_status
tracefs file.

Simple example to register a user event via dynamic_events:
	echo u:test >> dynamic_events
	cat dynamic_events
	u:test

If an event is hooked to a probe, the probe hooked shows up:
	echo 1 > events/user_events/test/enable
	cat user_events_status
	1:test # Used by ftrace

	Active: 1
	Busy: 1
	Max: 4096

If an event is not hooked to a probe, no probe status shows up:
	echo 0 > events/user_events/test/enable
	cat user_events_status
	1:test

	Active: 1
	Busy: 0
	Max: 4096

Users can describe the trace event format via the following format:
	name[:FLAG1[,FLAG2...] [field1[;field2...]]

Each field has the following format:
	type name

Example for char array with a size of 20 named msg:
	echo 'u:detailed char[20] msg' >> dynamic_events
	cat dynamic_events
	u:detailed char[20] msg

Data offsets are based on the data written out via write() and will be
updated to reflect the correct offset in the trace_event fields. For dynamic
data it is recommended to use the new __rel_loc data type. This type will be
the same as __data_loc, but the offset is relative to this entry. This allows
user_events to not worry about what common fields are being inserted before
the data.

The above format is valid for both the ioctl and the dynamic_events file.

Link: https://lkml.kernel.org/r/20220118204326.2169-2-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig             |   14 +
 kernel/trace/Makefile            |    1 +
 kernel/trace/trace_events_user.c | 1187 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 1202 insertions(+)
 create mode 100644 kernel/trace/trace_events_user.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5eb5e7fd624..16a52a71732d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -737,6 +737,20 @@ config SYNTH_EVENTS
 
 	  If in doubt, say N.
 
+config USER_EVENTS
+	bool "User trace events"
+	select TRACING
+	select DYNAMIC_EVENTS
+	help
+	  User trace events are user-defined trace events that
+	  can be used like an existing kernel trace event.  User trace
+	  events are generated by writing to a tracefs file.  User
+	  processes can determine if their tracing events should be
+	  generated by memory mapping a tracefs file and checking for
+	  an associated byte being non-zero.
+
+	  If in doubt, say N.
+
 config HIST_TRIGGERS
 	bool "Histogram triggers"
 	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..19ef3758da95 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
 obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
 obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
 obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
 obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..77105233115e
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1187 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ *   Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/user_events.h>
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be multiple of PAGE_SIZE.
+ */
+#define MAX_PAGES 1
+#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+
+static char *register_page_data;
+
+static DEFINE_MUTEX(reg_mutex);
+static DEFINE_HASHTABLE(register_table, 4);
+static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches zero.
+ */
+struct user_event {
+	struct tracepoint tracepoint;
+	struct trace_event_call call;
+	struct trace_event_class class;
+	struct dyn_event devent;
+	struct hlist_node node;
+	struct list_head fields;
+	atomic_t refcnt;
+	int index;
+	int flags;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+	struct rcu_head rcu;
+	int count;
+	struct user_event *events[];
+};
+
+typedef void (*user_event_func_t) (struct user_event *user,
+				   void *data, u32 datalen,
+				   void *tpdata);
+
+static int user_event_parse(char *name, char *args, char *flags,
+			    struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+	return jhash(name, strlen(name), 0);
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+	struct user_event *user = (struct user_event *)call->data;
+
+	return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ */
+static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+{
+	char *name = raw_command;
+	char *args = strpbrk(name, " ");
+	char *flags;
+
+	if (args)
+		*args++ = '\0';
+
+	flags = strpbrk(name, ":");
+
+	if (flags)
+		*flags++ = '\0';
+
+	return user_event_parse(name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+	const char *start = strchr(type, '[');
+	char val[8];
+	char *bracket;
+	int size = 0;
+
+	if (start == NULL)
+		return -EINVAL;
+
+	if (strscpy(val, start + 1, sizeof(val)) <= 0)
+		return -EINVAL;
+
+	bracket = strchr(val, ']');
+
+	if (!bracket)
+		return -EINVAL;
+
+	*bracket = '\0';
+
+	if (kstrtouint(val, 0, &size))
+		return -EINVAL;
+
+	if (size > MAX_FIELD_ARRAY_SIZE)
+		return -EINVAL;
+
+	return size;
+}
+
+static int user_field_size(const char *type)
+{
+	/* long is not allowed from a user, since it's ambigious in size */
+	if (strcmp(type, "s64") == 0)
+		return sizeof(s64);
+	if (strcmp(type, "u64") == 0)
+		return sizeof(u64);
+	if (strcmp(type, "s32") == 0)
+		return sizeof(s32);
+	if (strcmp(type, "u32") == 0)
+		return sizeof(u32);
+	if (strcmp(type, "int") == 0)
+		return sizeof(int);
+	if (strcmp(type, "unsigned int") == 0)
+		return sizeof(unsigned int);
+	if (strcmp(type, "s16") == 0)
+		return sizeof(s16);
+	if (strcmp(type, "u16") == 0)
+		return sizeof(u16);
+	if (strcmp(type, "short") == 0)
+		return sizeof(short);
+	if (strcmp(type, "unsigned short") == 0)
+		return sizeof(unsigned short);
+	if (strcmp(type, "s8") == 0)
+		return sizeof(s8);
+	if (strcmp(type, "u8") == 0)
+		return sizeof(u8);
+	if (strcmp(type, "char") == 0)
+		return sizeof(char);
+	if (strcmp(type, "unsigned char") == 0)
+		return sizeof(unsigned char);
+	if (str_has_prefix(type, "char["))
+		return user_field_array_size(type);
+	if (str_has_prefix(type, "unsigned char["))
+		return user_field_array_size(type);
+	if (str_has_prefix(type, "__data_loc "))
+		return sizeof(u32);
+	if (str_has_prefix(type, "__rel_loc "))
+		return sizeof(u32);
+
+	/* Uknown basic type, error */
+	return -EINVAL;
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+	struct ftrace_event_field *field, *next;
+	struct list_head *head = &user->fields;
+
+	list_for_each_entry_safe(field, next, head, link) {
+		list_del(&field->link);
+		kfree(field);
+	}
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+				const char *name, int offset, int size,
+				int is_signed, int filter_type)
+{
+	struct ftrace_event_field *field;
+
+	field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+	if (!field)
+		return -ENOMEM;
+
+	field->type = type;
+	field->name = name;
+	field->offset = offset;
+	field->size = size;
+	field->is_signed = is_signed;
+	field->filter_type = filter_type;
+
+	list_add(&field->link, &user->fields);
+
+	return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+				  u32 *offset)
+{
+	char *part, *type, *name;
+	u32 depth = 0, saved_offset = *offset;
+	int len, size = -EINVAL;
+	bool is_struct = false;
+
+	field = skip_spaces(field);
+
+	if (*field == '\0')
+		return 0;
+
+	/* Handle types that have a space within */
+	len = str_has_prefix(field, "unsigned ");
+	if (len)
+		goto skip_next;
+
+	len = str_has_prefix(field, "struct ");
+	if (len) {
+		is_struct = true;
+		goto skip_next;
+	}
+
+	len = str_has_prefix(field, "__data_loc unsigned ");
+	if (len)
+		goto skip_next;
+
+	len = str_has_prefix(field, "__data_loc ");
+	if (len)
+		goto skip_next;
+
+	len = str_has_prefix(field, "__rel_loc unsigned ");
+	if (len)
+		goto skip_next;
+
+	len = str_has_prefix(field, "__rel_loc ");
+	if (len)
+		goto skip_next;
+
+	goto parse;
+skip_next:
+	type = field;
+	field = strpbrk(field + len, " ");
+
+	if (field == NULL)
+		return -EINVAL;
+
+	*field++ = '\0';
+	depth++;
+parse:
+	while ((part = strsep(&field, " ")) != NULL) {
+		switch (depth++) {
+		case FIELD_DEPTH_TYPE:
+			type = part;
+			break;
+		case FIELD_DEPTH_NAME:
+			name = part;
+			break;
+		case FIELD_DEPTH_SIZE:
+			if (!is_struct)
+				return -EINVAL;
+
+			if (kstrtou32(part, 10, &size))
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (depth < FIELD_DEPTH_SIZE)
+		return -EINVAL;
+
+	if (depth == FIELD_DEPTH_SIZE)
+		size = user_field_size(type);
+
+	if (size == 0)
+		return -EINVAL;
+
+	if (size < 0)
+		return size;
+
+	*offset = saved_offset + size;
+
+	return user_event_add_field(user, type, name, saved_offset, size,
+				    type[0] != 'u', FILTER_OTHER);
+}
+
+static void user_event_parse_flags(struct user_event *user, char *flags)
+{
+	char *flag;
+
+	if (flags == NULL)
+		return;
+
+	while ((flag = strsep(&flags, ",")) != NULL) {
+		if (strcmp(flag, "BPF_ITER") == 0)
+			user->flags |= FLAG_BPF_ITER;
+	}
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+	char *field;
+	u32 offset = sizeof(struct trace_entry);
+	int ret = -EINVAL;
+
+	if (args == NULL)
+		return 0;
+
+	while ((field = strsep(&args, ";")) != NULL) {
+		ret = user_event_parse_field(field, user, &offset);
+
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+						int flags,
+						struct trace_event *event)
+{
+	/* Unsafe to try to decode user provided print_fmt, use hex */
+	trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+				 1, iter->ent, iter->ent_size, true);
+
+	return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+	.trace = user_event_print_trace,
+};
+
+static int destroy_user_event(struct user_event *user)
+{
+	int ret = 0;
+
+	/* Must destroy fields before call removal */
+	user_event_destroy_fields(user);
+
+	ret = trace_remove_event_call(&user->call);
+
+	if (ret)
+		return ret;
+
+	dyn_event_remove(&user->devent);
+
+	register_page_data[user->index] = 0;
+	clear_bit(user->index, page_bitmap);
+	hash_del(&user->node);
+
+	kfree(EVENT_NAME(user));
+	kfree(user);
+
+	return ret;
+}
+
+static struct user_event *find_user_event(char *name, u32 *outkey)
+{
+	struct user_event *user;
+	u32 key = user_event_key(name);
+
+	*outkey = key;
+
+	hash_for_each_possible(register_table, user, node, key)
+		if (!strcmp(EVENT_NAME(user), name))
+			return user;
+
+	return NULL;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, void *data, u32 datalen,
+			      void *tpdata)
+{
+	struct trace_event_file *file;
+	struct trace_entry *entry;
+	struct trace_event_buffer event_buffer;
+
+	file = (struct trace_event_file *)tpdata;
+
+	if (!file ||
+	    !(file->flags & EVENT_FILE_FL_ENABLED) ||
+	    trace_trigger_soft_disabled(file))
+		return;
+
+	/* Allocates and fills trace_entry, + 1 of this is data payload */
+	entry = trace_event_buffer_reserve(&event_buffer, file,
+					   sizeof(*entry) + datalen);
+
+	if (unlikely(!entry))
+		return;
+
+	memcpy(entry + 1, data, datalen);
+
+	trace_event_buffer_commit(&event_buffer);
+}
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+	struct tracepoint *tp = &user->tracepoint;
+	char status = 0;
+
+	if (atomic_read(&tp->key.enabled) > 0) {
+		struct tracepoint_func *probe_func_ptr;
+		user_event_func_t probe_func;
+
+		rcu_read_lock_sched();
+
+		probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+		if (probe_func_ptr) {
+			do {
+				probe_func = probe_func_ptr->func;
+
+				if (probe_func == user_event_ftrace)
+					status |= EVENT_STATUS_FTRACE;
+				else
+					status |= EVENT_STATUS_OTHER;
+			} while ((++probe_func_ptr)->func);
+		}
+
+		rcu_read_unlock_sched();
+	}
+
+	register_page_data[user->index] = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+			  enum trace_reg type,
+			  void *data)
+{
+	struct user_event *user = (struct user_event *)call->data;
+	int ret = 0;
+
+	if (!user)
+		return -ENOENT;
+
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		ret = tracepoint_probe_register(call->tp,
+						call->class->probe,
+						data);
+		if (!ret)
+			goto inc;
+		break;
+
+	case TRACE_REG_UNREGISTER:
+		tracepoint_probe_unregister(call->tp,
+					    call->class->probe,
+					    data);
+		goto dec;
+
+	default:
+		break;
+	}
+
+	return ret;
+inc:
+	atomic_inc(&user->refcnt);
+	update_reg_page_for(user);
+	return 0;
+dec:
+	update_reg_page_for(user);
+	atomic_dec(&user->refcnt);
+	return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+	struct user_event *user;
+	char *name;
+	int ret;
+
+	if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+		return -ECANCELED;
+
+	raw_command += USER_EVENTS_PREFIX_LEN;
+	raw_command = skip_spaces(raw_command);
+
+	name = kstrdup(raw_command, GFP_KERNEL);
+
+	if (!name)
+		return -ENOMEM;
+
+	mutex_lock(&reg_mutex);
+	ret = user_event_parse_cmd(name, &user);
+	mutex_unlock(&reg_mutex);
+
+	if (ret)
+		kfree(name);
+
+	return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+	struct ftrace_event_field *field, *next;
+	struct list_head *head;
+	int depth = 0;
+
+	seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+	head = trace_get_fields(&user->call);
+
+	list_for_each_entry_safe_reverse(field, next, head, link) {
+		if (depth == 0)
+			seq_puts(m, " ");
+		else
+			seq_puts(m, "; ");
+
+		seq_printf(m, "%s %s", field->type, field->name);
+
+		if (str_has_prefix(field->type, "struct "))
+			seq_printf(m, " %d", field->size);
+
+		depth++;
+	}
+
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+
+	return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+
+	if (atomic_read(&user->refcnt) != 0)
+		return -EBUSY;
+
+	return destroy_user_event(user);
+}
+
+static bool user_event_match(const char *system, const char *event,
+			     int argc, const char **argv, struct dyn_event *ev)
+{
+	struct user_event *user = container_of(ev, struct user_event, devent);
+
+	return strcmp(EVENT_NAME(user), event) == 0 &&
+		(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+}
+
+static struct dyn_event_operations user_event_dops = {
+	.create = user_event_create,
+	.show = user_event_show,
+	.is_busy = user_event_is_busy,
+	.free = user_event_free,
+	.match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+	int ret;
+
+	ret = register_trace_event(&user->call.event);
+
+	if (!ret)
+		return -ENODEV;
+
+	ret = trace_add_event_call(&user->call);
+
+	if (ret)
+		unregister_trace_event(&user->call.event);
+
+	return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+			    struct user_event **newuser)
+{
+	int ret;
+	int index;
+	u32 key;
+	struct user_event *user = find_user_event(name, &key);
+
+	if (user) {
+		*newuser = user;
+		/*
+		 * Name is allocated by caller, free it since it already exists.
+		 * Caller only worries about failure cases for freeing.
+		 */
+		kfree(name);
+		return 0;
+	}
+
+	index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+
+	if (index == MAX_EVENTS)
+		return -EMFILE;
+
+	user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+	if (!user)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&user->class.fields);
+	INIT_LIST_HEAD(&user->fields);
+
+	user->tracepoint.name = name;
+
+	user_event_parse_flags(user, flags);
+
+	ret = user_event_parse_fields(user, args);
+
+	if (ret)
+		goto put_user;
+
+	/* Minimal print format */
+	user->call.print_fmt = "\"\"";
+
+	user->call.data = user;
+	user->call.class = &user->class;
+	user->call.name = name;
+	user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+	user->call.tp = &user->tracepoint;
+	user->call.event.funcs = &user_event_funcs;
+
+	user->class.system = USER_EVENTS_SYSTEM;
+	user->class.fields_array = user_event_fields_array;
+	user->class.get_fields = user_event_get_fields;
+	user->class.reg = user_event_reg;
+	user->class.probe = user_event_ftrace;
+
+	mutex_lock(&event_mutex);
+	ret = user_event_trace_register(user);
+	mutex_unlock(&event_mutex);
+
+	if (ret)
+		goto put_user;
+
+	user->index = index;
+	dyn_event_init(&user->devent, &user_event_dops);
+	dyn_event_add(&user->devent, &user->call);
+	set_bit(user->index, page_bitmap);
+	hash_add(register_table, &user->node, key);
+
+	*newuser = user;
+	return 0;
+put_user:
+	user_event_destroy_fields(user);
+	kfree(user);
+	return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(char *name)
+{
+	u32 key;
+	int ret;
+	struct user_event *user = find_user_event(name, &key);
+
+	if (!user)
+		return -ENOENT;
+
+	if (atomic_read(&user->refcnt) != 0)
+		return -EBUSY;
+
+	mutex_lock(&event_mutex);
+	ret = destroy_user_event(user);
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+	struct user_event_refs *refs;
+	struct user_event *user = NULL;
+	struct tracepoint *tp;
+	ssize_t ret = i->count;
+	int idx;
+
+	if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+		return -EFAULT;
+
+	rcu_read_lock_sched();
+
+	refs = rcu_dereference_sched(file->private_data);
+
+	/*
+	 * The refs->events array is protected by RCU, and new items may be
+	 * added. But the user retrieved from indexing into the events array
+	 * shall be immutable while the file is opened.
+	 */
+	if (likely(refs && idx < refs->count))
+		user = refs->events[idx];
+
+	rcu_read_unlock_sched();
+
+	if (unlikely(user == NULL))
+		return -ENOENT;
+
+	tp = &user->tracepoint;
+
+	/*
+	 * It's possible key.enabled disables after this check, however
+	 * we don't mind if a few events are included in this condition.
+	 */
+	if (likely(atomic_read(&tp->key.enabled) > 0)) {
+		struct tracepoint_func *probe_func_ptr;
+		user_event_func_t probe_func;
+		void *tpdata;
+		void *kdata;
+		u32 datalen;
+
+		kdata = kmalloc(i->count, GFP_KERNEL);
+
+		if (unlikely(!kdata))
+			return -ENOMEM;
+
+		datalen = copy_from_iter(kdata, i->count, i);
+
+		rcu_read_lock_sched();
+
+		probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+		if (probe_func_ptr) {
+			do {
+				probe_func = probe_func_ptr->func;
+				tpdata = probe_func_ptr->data;
+				probe_func(user, kdata, datalen, tpdata);
+			} while ((++probe_func_ptr)->func);
+		}
+
+		rcu_read_unlock_sched();
+
+		kfree(kdata);
+	}
+
+	return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	struct iovec iov;
+	struct iov_iter i;
+
+	if (unlikely(*ppos != 0))
+		return -EFAULT;
+
+	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+		return -EFAULT;
+
+	return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+	return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+	struct user_event_refs *refs, *new_refs;
+	int i, size, count = 0;
+
+	refs = rcu_dereference_protected(file->private_data,
+					 lockdep_is_held(&reg_mutex));
+
+	if (refs) {
+		count = refs->count;
+
+		for (i = 0; i < count; ++i)
+			if (refs->events[i] == user)
+				return i;
+	}
+
+	size = struct_size(refs, events, count + 1);
+
+	new_refs = kzalloc(size, GFP_KERNEL);
+
+	if (!new_refs)
+		return -ENOMEM;
+
+	new_refs->count = count + 1;
+
+	for (i = 0; i < count; ++i)
+		new_refs->events[i] = refs->events[i];
+
+	new_refs->events[i] = user;
+
+	atomic_inc(&user->refcnt);
+
+	rcu_assign_pointer(file->private_data, new_refs);
+
+	if (refs)
+		kfree_rcu(refs, rcu);
+
+	return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+	u32 size;
+	long ret;
+
+	ret = get_user(size, &ureg->size);
+
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+	struct user_reg reg;
+	struct user_event *user;
+	char *name;
+	long ret;
+
+	ret = user_reg_get(ureg, &reg);
+
+	if (ret)
+		return ret;
+
+	name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+			    MAX_EVENT_DESC);
+
+	if (IS_ERR(name)) {
+		ret = PTR_ERR(name);
+		return ret;
+	}
+
+	ret = user_event_parse_cmd(name, &user);
+
+	if (ret) {
+		kfree(name);
+		return ret;
+	}
+
+	ret = user_events_ref_add(file, user);
+
+	/* Positive number is index and valid */
+	if (ret < 0)
+		return ret;
+
+	put_user((u32)ret, &ureg->write_index);
+	put_user(user->index, &ureg->status_index);
+
+	return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+	void __user *ubuf = (void __user *)uarg;
+	char *name;
+	long ret;
+
+	name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	ret = delete_user_event(name);
+
+	kfree(name);
+
+	return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+			      unsigned long uarg)
+{
+	long ret = -ENOTTY;
+
+	switch (cmd) {
+	case DIAG_IOCSREG:
+		mutex_lock(&reg_mutex);
+		ret = user_events_ioctl_reg(file, uarg);
+		mutex_unlock(&reg_mutex);
+		break;
+
+	case DIAG_IOCSDEL:
+		mutex_lock(&reg_mutex);
+		ret = user_events_ioctl_del(file, uarg);
+		mutex_unlock(&reg_mutex);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+	struct user_event_refs *refs;
+	struct user_event *user;
+	int i;
+
+	/*
+	 * Ensure refs cannot change under any situation by taking the
+	 * register mutex during the final freeing of the references.
+	 */
+	mutex_lock(&reg_mutex);
+
+	refs = file->private_data;
+
+	if (!refs)
+		goto out;
+
+	/*
+	 * The lifetime of refs has reached an end, it's tied to this file.
+	 * The underlying user_events are ref counted, and cannot be freed.
+	 * After this decrement, the user_events may be freed elsewhere.
+	 */
+	for (i = 0; i < refs->count; ++i) {
+		user = refs->events[i];
+
+		if (user)
+			atomic_dec(&user->refcnt);
+	}
+out:
+	file->private_data = NULL;
+
+	mutex_unlock(&reg_mutex);
+
+	kfree(refs);
+
+	return 0;
+}
+
+static const struct file_operations user_data_fops = {
+	.write = user_events_write,
+	.write_iter = user_events_write_iter,
+	.unlocked_ioctl	= user_events_ioctl,
+	.release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+
+	if (size != MAX_EVENTS)
+		return -EINVAL;
+
+	return remap_pfn_range(vma, vma->vm_start,
+			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
+			       size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos)
+		return NULL;
+
+	return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+	struct user_event *user;
+	char status;
+	int i, active = 0, busy = 0, flags;
+
+	mutex_lock(&reg_mutex);
+
+	hash_for_each(register_table, i, user, node) {
+		status = register_page_data[user->index];
+		flags = user->flags;
+
+		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+		if (flags != 0 || status != 0)
+			seq_puts(m, " #");
+
+		if (status != 0) {
+			seq_puts(m, " Used by");
+			if (status & EVENT_STATUS_FTRACE)
+				seq_puts(m, " ftrace");
+			if (status & EVENT_STATUS_PERF)
+				seq_puts(m, " perf");
+			if (status & EVENT_STATUS_OTHER)
+				seq_puts(m, " other");
+			busy++;
+		}
+
+		if (flags & FLAG_BPF_ITER)
+			seq_puts(m, " FLAG:BPF_ITER");
+
+		seq_puts(m, "\n");
+		active++;
+	}
+
+	mutex_unlock(&reg_mutex);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Active: %d\n", active);
+	seq_printf(m, "Busy: %d\n", busy);
+	seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+	return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+	.start = user_seq_start,
+	.next  = user_seq_next,
+	.stop  = user_seq_stop,
+	.show  = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+	return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+	.open = user_status_open,
+	.mmap = user_status_mmap,
+	.read = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+	struct dentry *edata, *emmap;
+
+	edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+				    NULL, NULL, &user_data_fops);
+
+	if (!edata) {
+		pr_warn("Could not create tracefs 'user_events_data' entry\n");
+		goto err;
+	}
+
+	/* mmap with MAP_SHARED requires writable fd */
+	emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+				    NULL, NULL, &user_status_fops);
+
+	if (!emmap) {
+		tracefs_remove(edata);
+		pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+		goto err;
+	}
+
+	return 0;
+err:
+	return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+	int page;
+
+	for (page = 0; page < MAX_PAGES; ++page) {
+		void *addr = register_page_data + (PAGE_SIZE * page);
+
+		if (set)
+			SetPageReserved(virt_to_page(addr));
+		else
+			ClearPageReserved(virt_to_page(addr));
+	}
+}
+
+static int __init trace_events_user_init(void)
+{
+	int ret;
+
+	/* Zero all bits beside 0 (which is reserved for failures) */
+	bitmap_zero(page_bitmap, MAX_EVENTS);
+	set_bit(0, page_bitmap);
+
+	register_page_data = kzalloc(MAX_EVENTS, GFP_KERNEL);
+
+	if (!register_page_data)
+		return -ENOMEM;
+
+	set_page_reservations(true);
+
+	ret = create_user_tracefs();
+
+	if (ret) {
+		pr_warn("user_events could not register with tracefs\n");
+		set_page_reservations(false);
+		kfree(register_page_data);
+		return ret;
+	}
+
+	if (dyn_event_register(&user_event_dops))
+		pr_warn("user_events could not register with dyn_events\n");
+
+	return 0;
+}
+
+fs_initcall(trace_events_user_init);
-- 
cgit v1.2.3


From aa3b2b4c669205200615dd8a2cc4af4f81fd0335 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 18 Jan 2022 12:43:16 -0800
Subject: user_events: Add print_fmt generation support for basic types

Addes print_fmt format generation for basic types that are supported for
user processes. Only supports sizes that are the same on 32 and 64 bit.

Link: https://lkml.kernel.org/r/20220118204326.2169-3-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 115 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 77105233115e..ddc5c3cf1bf8 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -359,6 +359,114 @@ static int user_event_parse_fields(struct user_event *user, char *args)
 
 static struct trace_event_fields user_event_fields_array[1];
 
+static const char *user_field_format(const char *type)
+{
+	if (strcmp(type, "s64") == 0)
+		return "%lld";
+	if (strcmp(type, "u64") == 0)
+		return "%llu";
+	if (strcmp(type, "s32") == 0)
+		return "%d";
+	if (strcmp(type, "u32") == 0)
+		return "%u";
+	if (strcmp(type, "int") == 0)
+		return "%d";
+	if (strcmp(type, "unsigned int") == 0)
+		return "%u";
+	if (strcmp(type, "s16") == 0)
+		return "%d";
+	if (strcmp(type, "u16") == 0)
+		return "%u";
+	if (strcmp(type, "short") == 0)
+		return "%d";
+	if (strcmp(type, "unsigned short") == 0)
+		return "%u";
+	if (strcmp(type, "s8") == 0)
+		return "%d";
+	if (strcmp(type, "u8") == 0)
+		return "%u";
+	if (strcmp(type, "char") == 0)
+		return "%d";
+	if (strcmp(type, "unsigned char") == 0)
+		return "%u";
+	if (strstr(type, "char[") != 0)
+		return "%s";
+
+	/* Unknown, likely struct, allowed treat as 64-bit */
+	return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+	if (str_has_prefix(type, "__data_loc ")) {
+		*str_func = "__get_str";
+		goto check;
+	}
+
+	if (str_has_prefix(type, "__rel_loc ")) {
+		*str_func = "__get_rel_str";
+		goto check;
+	}
+
+	return false;
+check:
+	return strstr(type, "char") != 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+	struct ftrace_event_field *field, *next;
+	struct list_head *head = &user->fields;
+	int pos = 0, depth = 0;
+	const char *str_func;
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+	list_for_each_entry_safe_reverse(field, next, head, link) {
+		if (depth != 0)
+			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+				field->name, user_field_format(field->type));
+
+		depth++;
+	}
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+	list_for_each_entry_safe_reverse(field, next, head, link) {
+		if (user_field_is_dyn_string(field->type, &str_func))
+			pos += snprintf(buf + pos, LEN_OR_ZERO,
+					", %s(%s)", str_func, field->name);
+		else
+			pos += snprintf(buf + pos, LEN_OR_ZERO,
+					", REC->%s", field->name);
+	}
+
+	return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+	char *print_fmt;
+	int len;
+
+	len = user_event_set_print_fmt(user, NULL, 0);
+
+	print_fmt = kmalloc(len, GFP_KERNEL);
+
+	if (!print_fmt)
+		return -ENOMEM;
+
+	user_event_set_print_fmt(user, print_fmt, len);
+
+	user->call.print_fmt = print_fmt;
+
+	return 0;
+}
+
 static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
 						int flags,
 						struct trace_event *event)
@@ -392,6 +500,7 @@ static int destroy_user_event(struct user_event *user)
 	clear_bit(user->index, page_bitmap);
 	hash_del(&user->node);
 
+	kfree(user->call.print_fmt);
 	kfree(EVENT_NAME(user));
 	kfree(user);
 
@@ -669,8 +778,10 @@ static int user_event_parse(char *name, char *args, char *flags,
 	if (ret)
 		goto put_user;
 
-	/* Minimal print format */
-	user->call.print_fmt = "\"\"";
+	ret = user_event_create_print_fmt(user);
+
+	if (ret)
+		goto put_user;
 
 	user->call.data = user;
 	user->call.class = &user->class;
-- 
cgit v1.2.3


From 9aed4e157d1ffe4aeebc005b4eceede1ed5a403a Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 18 Jan 2022 12:43:17 -0800
Subject: user_events: Handle matching arguments from dyn_events

Ensures that when dynamic events requests a match with arguments that
they match what is in the user_event.

Link: https://lkml.kernel.org/r/20220118204326.2169-4-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 77 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index ddc5c3cf1bf8..a6794cb1f586 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -39,6 +39,7 @@
 #define MAX_EVENT_DESC 512
 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
+#define MAX_FIELD_ARG_NAME 256
 
 static char *register_page_data;
 
@@ -700,13 +701,87 @@ static int user_event_free(struct dyn_event *ev)
 	return destroy_user_event(user);
 }
 
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+			     const char **argv, int *iout)
+{
+	char *field_name, *arg_name;
+	int len, pos, i = *iout;
+	bool colon = false, match = false;
+
+	if (i >= argc)
+		return false;
+
+	len = MAX_FIELD_ARG_NAME;
+	field_name = kmalloc(len, GFP_KERNEL);
+	arg_name = kmalloc(len, GFP_KERNEL);
+
+	if (!arg_name || !field_name)
+		goto out;
+
+	pos = 0;
+
+	for (; i < argc; ++i) {
+		if (i != *iout)
+			pos += snprintf(arg_name + pos, len - pos, " ");
+
+		pos += snprintf(arg_name + pos, len - pos, argv[i]);
+
+		if (strchr(argv[i], ';')) {
+			++i;
+			colon = true;
+			break;
+		}
+	}
+
+	pos = 0;
+
+	pos += snprintf(field_name + pos, len - pos, field->type);
+	pos += snprintf(field_name + pos, len - pos, " ");
+	pos += snprintf(field_name + pos, len - pos, field->name);
+
+	if (colon)
+		pos += snprintf(field_name + pos, len - pos, ";");
+
+	*iout = i;
+
+	match = strcmp(arg_name, field_name) == 0;
+out:
+	kfree(arg_name);
+	kfree(field_name);
+
+	return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+			      const char **argv)
+{
+	struct ftrace_event_field *field, *next;
+	struct list_head *head = &user->fields;
+	int i = 0;
+
+	list_for_each_entry_safe_reverse(field, next, head, link)
+		if (!user_field_match(field, argc, argv, &i))
+			return false;
+
+	if (i != argc)
+		return false;
+
+	return true;
+}
+
 static bool user_event_match(const char *system, const char *event,
 			     int argc, const char **argv, struct dyn_event *ev)
 {
 	struct user_event *user = container_of(ev, struct user_event, devent);
+	bool match;
 
-	return strcmp(EVENT_NAME(user), event) == 0 &&
+	match = strcmp(EVENT_NAME(user), event) == 0 &&
 		(!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+	if (match && argc > 0)
+		match = user_fields_match(user, argc, argv);
+
+	return match;
 }
 
 static struct dyn_event_operations user_event_dops = {
-- 
cgit v1.2.3


From 3207d0459ef3789c7efa801b57123c8a79d05694 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 18 Jan 2022 12:43:18 -0800
Subject: user_events: Add basic perf and eBPF support

Adds support to write out user_event data to perf_probe/perf files as
well as to any attached eBPF program.

Link: https://lkml.kernel.org/r/20220118204326.2169-5-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 72 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index a6794cb1f586..371f31472156 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -551,6 +551,50 @@ static void user_event_ftrace(struct user_event *user, void *data, u32 datalen,
 	trace_event_buffer_commit(&event_buffer);
 }
 
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Writes the user supplied payload out to perf ring buffer or eBPF program.
+ */
+static void user_event_perf(struct user_event *user, void *data, u32 datalen,
+			    void *tpdata)
+{
+	struct hlist_head *perf_head;
+
+	if (bpf_prog_array_valid(&user->call)) {
+		struct user_bpf_context context = {0};
+
+		context.data_len = datalen;
+		context.data_type = USER_BPF_DATA_KERNEL;
+		context.kdata = data;
+
+		trace_call_bpf(&user->call, &context);
+	}
+
+	perf_head = this_cpu_ptr(user->call.perf_events);
+
+	if (perf_head && !hlist_empty(perf_head)) {
+		struct trace_entry *perf_entry;
+		struct pt_regs *regs;
+		size_t size = sizeof(*perf_entry) + datalen;
+		int context;
+
+		perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+						  &regs, &context);
+
+		if (unlikely(!perf_entry))
+			return;
+
+		perf_fetch_caller_regs(regs);
+
+		memcpy(perf_entry + 1, data, datalen);
+
+		perf_trace_buf_submit(perf_entry, size, context,
+				      user->call.event.type, 1, regs,
+				      perf_head, NULL);
+	}
+}
+#endif
+
 /*
  * Update the register page that is shared between user processes.
  */
@@ -573,6 +617,10 @@ static void update_reg_page_for(struct user_event *user)
 
 				if (probe_func == user_event_ftrace)
 					status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+				else if (probe_func == user_event_perf)
+					status |= EVENT_STATUS_PERF;
+#endif
 				else
 					status |= EVENT_STATUS_OTHER;
 			} while ((++probe_func_ptr)->func);
@@ -612,8 +660,27 @@ static int user_event_reg(struct trace_event_call *call,
 					    data);
 		goto dec;
 
-	default:
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		ret = tracepoint_probe_register(call->tp,
+						call->class->perf_probe,
+						data);
+		if (!ret)
+			goto inc;
+		break;
+
+	case TRACE_REG_PERF_UNREGISTER:
+		tracepoint_probe_unregister(call->tp,
+					    call->class->perf_probe,
+					    data);
+		goto dec;
+
+	case TRACE_REG_PERF_OPEN:
+	case TRACE_REG_PERF_CLOSE:
+	case TRACE_REG_PERF_ADD:
+	case TRACE_REG_PERF_DEL:
 		break;
+#endif
 	}
 
 	return ret;
@@ -870,6 +937,9 @@ static int user_event_parse(char *name, char *args, char *flags,
 	user->class.get_fields = user_event_get_fields;
 	user->class.reg = user_event_reg;
 	user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+	user->class.perf_probe = user_event_perf;
+#endif
 
 	mutex_lock(&event_mutex);
 	ret = user_event_trace_register(user);
-- 
cgit v1.2.3


From 0279400ad38d858ed68f5d787385f6122d4170b2 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 18 Jan 2022 12:43:19 -0800
Subject: user_events: Optimize writing events by only copying data once

Pass iterator through to probes to allow copying data directly to the
probe buffers instead of taking multiple copies. Enables eBPF user and
raw iterator types out to programs for no-copy scenarios.

Link: https://lkml.kernel.org/r/20220118204326.2169-6-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 115 +++++++++++++++++++++++++++++----------
 1 file changed, 85 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 371f31472156..78b6b96c4cfa 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -41,6 +41,9 @@
 #define MAX_FIELD_ARRAY_SIZE 1024
 #define MAX_FIELD_ARG_NAME 256
 
+#define MAX_BPF_COPY_SIZE PAGE_SIZE
+#define MAX_STACK_BPF_DATA 512
+
 static char *register_page_data;
 
 static DEFINE_MUTEX(reg_mutex);
@@ -78,8 +81,7 @@ struct user_event_refs {
 	struct user_event *events[];
 };
 
-typedef void (*user_event_func_t) (struct user_event *user,
-				   void *data, u32 datalen,
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
 				   void *tpdata);
 
 static int user_event_parse(char *name, char *args, char *flags,
@@ -90,6 +92,20 @@ static u32 user_event_key(char *name)
 	return jhash(name, strlen(name), 0);
 }
 
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+	size_t ret;
+
+	pagefault_disable();
+
+	ret = copy_from_iter_nocache(addr, bytes, i);
+
+	pagefault_enable();
+
+	return ret;
+}
+
 static struct list_head *user_event_get_fields(struct trace_event_call *call)
 {
 	struct user_event *user = (struct user_event *)call->data;
@@ -525,7 +541,7 @@ static struct user_event *find_user_event(char *name, u32 *outkey)
 /*
  * Writes the user supplied payload out to a trace file.
  */
-static void user_event_ftrace(struct user_event *user, void *data, u32 datalen,
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
 			      void *tpdata)
 {
 	struct trace_event_file *file;
@@ -541,41 +557,83 @@ static void user_event_ftrace(struct user_event *user, void *data, u32 datalen,
 
 	/* Allocates and fills trace_entry, + 1 of this is data payload */
 	entry = trace_event_buffer_reserve(&event_buffer, file,
-					   sizeof(*entry) + datalen);
+					   sizeof(*entry) + i->count);
 
 	if (unlikely(!entry))
 		return;
 
-	memcpy(entry + 1, data, datalen);
-
-	trace_event_buffer_commit(&event_buffer);
+	if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+		__trace_event_discard_commit(event_buffer.buffer,
+					     event_buffer.event);
+	else
+		trace_event_buffer_commit(&event_buffer);
 }
 
 #ifdef CONFIG_PERF_EVENTS
+static void user_event_bpf(struct user_event *user, struct iov_iter *i)
+{
+	struct user_bpf_context context;
+	struct user_bpf_iter bpf_i;
+	char fast_data[MAX_STACK_BPF_DATA];
+	void *temp = NULL;
+
+	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
+		/* Raw iterator */
+		context.data_type = USER_BPF_DATA_ITER;
+		context.data_len = i->count;
+		context.iter = &bpf_i;
+
+		bpf_i.iov_offset = i->iov_offset;
+		bpf_i.iov = i->iov;
+		bpf_i.nr_segs = i->nr_segs;
+	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
+		/* Single buffer from user */
+		context.data_type = USER_BPF_DATA_USER;
+		context.data_len = i->count;
+		context.udata = i->iov->iov_base + i->iov_offset;
+	} else {
+		/* Multi buffer from user */
+		struct iov_iter copy = *i;
+		size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
+
+		context.data_type = USER_BPF_DATA_KERNEL;
+		context.kdata = fast_data;
+
+		if (unlikely(copy_size > sizeof(fast_data))) {
+			temp = kmalloc(copy_size, GFP_NOWAIT);
+
+			if (temp)
+				context.kdata = temp;
+			else
+				copy_size = sizeof(fast_data);
+		}
+
+		context.data_len = copy_nofault(context.kdata,
+						copy_size, &copy);
+	}
+
+	trace_call_bpf(&user->call, &context);
+
+	kfree(temp);
+}
+
 /*
  * Writes the user supplied payload out to perf ring buffer or eBPF program.
  */
-static void user_event_perf(struct user_event *user, void *data, u32 datalen,
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
 			    void *tpdata)
 {
 	struct hlist_head *perf_head;
 
-	if (bpf_prog_array_valid(&user->call)) {
-		struct user_bpf_context context = {0};
-
-		context.data_len = datalen;
-		context.data_type = USER_BPF_DATA_KERNEL;
-		context.kdata = data;
-
-		trace_call_bpf(&user->call, &context);
-	}
+	if (bpf_prog_array_valid(&user->call))
+		user_event_bpf(user, i);
 
 	perf_head = this_cpu_ptr(user->call.perf_events);
 
 	if (perf_head && !hlist_empty(perf_head)) {
 		struct trace_entry *perf_entry;
 		struct pt_regs *regs;
-		size_t size = sizeof(*perf_entry) + datalen;
+		size_t size = sizeof(*perf_entry) + i->count;
 		int context;
 
 		perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
@@ -586,7 +644,10 @@ static void user_event_perf(struct user_event *user, void *data, u32 datalen,
 
 		perf_fetch_caller_regs(regs);
 
-		memcpy(perf_entry + 1, data, datalen);
+		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) {
+			perf_swevent_put_recursion_context(context);
+			return;
+		}
 
 		perf_trace_buf_submit(perf_entry, size, context,
 				      user->call.event.type, 1, regs,
@@ -1024,16 +1085,11 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 	if (likely(atomic_read(&tp->key.enabled) > 0)) {
 		struct tracepoint_func *probe_func_ptr;
 		user_event_func_t probe_func;
+		struct iov_iter copy;
 		void *tpdata;
-		void *kdata;
-		u32 datalen;
 
-		kdata = kmalloc(i->count, GFP_KERNEL);
-
-		if (unlikely(!kdata))
-			return -ENOMEM;
-
-		datalen = copy_from_iter(kdata, i->count, i);
+		if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+			return -EFAULT;
 
 		rcu_read_lock_sched();
 
@@ -1041,15 +1097,14 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 
 		if (probe_func_ptr) {
 			do {
+				copy = *i;
 				probe_func = probe_func_ptr->func;
 				tpdata = probe_func_ptr->data;
-				probe_func(user, kdata, datalen, tpdata);
+				probe_func(user, &copy, tpdata);
 			} while ((++probe_func_ptr)->func);
 		}
 
 		rcu_read_unlock_sched();
-
-		kfree(kdata);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 2467cda1b5c97a58776a8aebfa5d76543e47479d Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 18 Jan 2022 12:43:20 -0800
Subject: user_events: Validate user payloads for size and null termination

Add validation to ensure data is at or greater than the min size for the
fields of the event. If a dynamic array is used and is a type of char,
ensure null termination of the array exists.

Link: https://lkml.kernel.org/r/20220118204326.2169-7-beaub@linux.microsoft.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 147 +++++++++++++++++++++++++++++++++++----
 1 file changed, 133 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 78b6b96c4cfa..2b5e9fdb63a0 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -64,9 +64,11 @@ struct user_event {
 	struct dyn_event devent;
 	struct hlist_node node;
 	struct list_head fields;
+	struct list_head validators;
 	atomic_t refcnt;
 	int index;
 	int flags;
+	int min_size;
 };
 
 /*
@@ -81,8 +83,17 @@ struct user_event_refs {
 	struct user_event *events[];
 };
 
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+	struct list_head link;
+	int offset;
+	int flags;
+};
+
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
-				   void *tpdata);
+				   void *tpdata, bool *faulted);
 
 static int user_event_parse(char *name, char *args, char *flags,
 			    struct user_event **newuser);
@@ -215,6 +226,17 @@ static int user_field_size(const char *type)
 	return -EINVAL;
 }
 
+static void user_event_destroy_validators(struct user_event *user)
+{
+	struct user_event_validator *validator, *next;
+	struct list_head *head = &user->validators;
+
+	list_for_each_entry_safe(validator, next, head, link) {
+		list_del(&validator->link);
+		kfree(validator);
+	}
+}
+
 static void user_event_destroy_fields(struct user_event *user)
 {
 	struct ftrace_event_field *field, *next;
@@ -230,13 +252,43 @@ static int user_event_add_field(struct user_event *user, const char *type,
 				const char *name, int offset, int size,
 				int is_signed, int filter_type)
 {
+	struct user_event_validator *validator;
 	struct ftrace_event_field *field;
+	int validator_flags = 0;
 
 	field = kmalloc(sizeof(*field), GFP_KERNEL);
 
 	if (!field)
 		return -ENOMEM;
 
+	if (str_has_prefix(type, "__data_loc "))
+		goto add_validator;
+
+	if (str_has_prefix(type, "__rel_loc ")) {
+		validator_flags |= VALIDATOR_REL;
+		goto add_validator;
+	}
+
+	goto add_field;
+
+add_validator:
+	if (strstr(type, "char") != 0)
+		validator_flags |= VALIDATOR_ENSURE_NULL;
+
+	validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+	if (!validator) {
+		kfree(field);
+		return -ENOMEM;
+	}
+
+	validator->flags = validator_flags;
+	validator->offset = offset;
+
+	/* Want sequential access when validating */
+	list_add_tail(&validator->link, &user->validators);
+
+add_field:
 	field->type = type;
 	field->name = name;
 	field->offset = offset;
@@ -246,6 +298,12 @@ static int user_event_add_field(struct user_event *user, const char *type,
 
 	list_add(&field->link, &user->fields);
 
+	/*
+	 * Min size from user writes that are required, this does not include
+	 * the size of trace_entry (common fields).
+	 */
+	user->min_size = (offset + size) - sizeof(struct trace_entry);
+
 	return 0;
 }
 
@@ -517,6 +575,7 @@ static int destroy_user_event(struct user_event *user)
 	clear_bit(user->index, page_bitmap);
 	hash_del(&user->node);
 
+	user_event_destroy_validators(user);
 	kfree(user->call.print_fmt);
 	kfree(EVENT_NAME(user));
 	kfree(user);
@@ -538,15 +597,49 @@ static struct user_event *find_user_event(char *name, u32 *outkey)
 	return NULL;
 }
 
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+	struct list_head *head = &user->validators;
+	struct user_event_validator *validator;
+	void *pos, *end = data + len;
+	u32 loc, offset, size;
+
+	list_for_each_entry(validator, head, link) {
+		pos = data + validator->offset;
+
+		/* Already done min_size check, no bounds check here */
+		loc = *(u32 *)pos;
+		offset = loc & 0xffff;
+		size = loc >> 16;
+
+		if (likely(validator->flags & VALIDATOR_REL))
+			pos += offset + sizeof(loc);
+		else
+			pos = data + offset;
+
+		pos += size;
+
+		if (unlikely(pos > end))
+			return -EFAULT;
+
+		if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+			if (unlikely(*(char *)(pos - 1) != '\0'))
+				return -EFAULT;
+	}
+
+	return 0;
+}
+
 /*
  * Writes the user supplied payload out to a trace file.
  */
 static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
-			      void *tpdata)
+			      void *tpdata, bool *faulted)
 {
 	struct trace_event_file *file;
 	struct trace_entry *entry;
 	struct trace_event_buffer event_buffer;
+	size_t size = sizeof(*entry) + i->count;
 
 	file = (struct trace_event_file *)tpdata;
 
@@ -556,17 +649,25 @@ static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
 		return;
 
 	/* Allocates and fills trace_entry, + 1 of this is data payload */
-	entry = trace_event_buffer_reserve(&event_buffer, file,
-					   sizeof(*entry) + i->count);
+	entry = trace_event_buffer_reserve(&event_buffer, file, size);
 
 	if (unlikely(!entry))
 		return;
 
 	if (unlikely(!copy_nofault(entry + 1, i->count, i)))
-		__trace_event_discard_commit(event_buffer.buffer,
-					     event_buffer.event);
-	else
-		trace_event_buffer_commit(&event_buffer);
+		goto discard;
+
+	if (!list_empty(&user->validators) &&
+	    unlikely(user_event_validate(user, entry, size)))
+		goto discard;
+
+	trace_event_buffer_commit(&event_buffer);
+
+	return;
+discard:
+	*faulted = true;
+	__trace_event_discard_commit(event_buffer.buffer,
+				     event_buffer.event);
 }
 
 #ifdef CONFIG_PERF_EVENTS
@@ -621,7 +722,7 @@ static void user_event_bpf(struct user_event *user, struct iov_iter *i)
  * Writes the user supplied payload out to perf ring buffer or eBPF program.
  */
 static void user_event_perf(struct user_event *user, struct iov_iter *i,
-			    void *tpdata)
+			    void *tpdata, bool *faulted)
 {
 	struct hlist_head *perf_head;
 
@@ -644,14 +745,21 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i,
 
 		perf_fetch_caller_regs(regs);
 
-		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i))) {
-			perf_swevent_put_recursion_context(context);
-			return;
-		}
+		if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+			goto discard;
+
+		if (!list_empty(&user->validators) &&
+		    unlikely(user_event_validate(user, perf_entry, size)))
+			goto discard;
 
 		perf_trace_buf_submit(perf_entry, size, context,
 				      user->call.event.type, 1, regs,
 				      perf_head, NULL);
+
+		return;
+discard:
+		*faulted = true;
+		perf_swevent_put_recursion_context(context);
 	}
 }
 #endif
@@ -971,6 +1079,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	INIT_LIST_HEAD(&user->class.fields);
 	INIT_LIST_HEAD(&user->fields);
+	INIT_LIST_HEAD(&user->validators);
 
 	user->tracepoint.name = name;
 
@@ -1019,6 +1128,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 	return 0;
 put_user:
 	user_event_destroy_fields(user);
+	user_event_destroy_validators(user);
 	kfree(user);
 	return ret;
 }
@@ -1076,6 +1186,9 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 	if (unlikely(user == NULL))
 		return -ENOENT;
 
+	if (unlikely(i->count < user->min_size))
+		return -EINVAL;
+
 	tp = &user->tracepoint;
 
 	/*
@@ -1087,10 +1200,13 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 		user_event_func_t probe_func;
 		struct iov_iter copy;
 		void *tpdata;
+		bool faulted;
 
 		if (unlikely(fault_in_iov_iter_readable(i, i->count)))
 			return -EFAULT;
 
+		faulted = false;
+
 		rcu_read_lock_sched();
 
 		probe_func_ptr = rcu_dereference_sched(tp->funcs);
@@ -1100,11 +1216,14 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 				copy = *i;
 				probe_func = probe_func_ptr->func;
 				tpdata = probe_func_ptr->data;
-				probe_func(user, &copy, tpdata);
+				probe_func(user, &copy, tpdata, &faulted);
 			} while ((++probe_func_ptr)->func);
 		}
 
 		rcu_read_unlock_sched();
+
+		if (unlikely(faulted))
+			return -EFAULT;
 	}
 
 	return ret;
-- 
cgit v1.2.3


From c441e934b604a3b5f350a9104124cf6a3ba07a34 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Sat, 15 Jan 2022 15:16:57 -0800
Subject: locking: Add missing __sched attributes

This patch adds __sched attributes to a few missing places
to show blocked function rather than locking function
in get_wchan.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220115231657.84828-1-minchan@kernel.org
---
 kernel/locking/percpu-rwsem.c | 5 +++--
 kernel/locking/rwsem.c        | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 70a32a576f3f..c9fdae94e098 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/debug.h>
 #include <linux/errno.h>
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
@@ -162,7 +163,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
 	__set_current_state(TASK_RUNNING);
 }
 
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 {
 	if (__percpu_down_read_trylock(sem))
 		return true;
@@ -211,7 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
 	return true;
 }
 
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 69aba4abe104..acde5d6f1254 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1048,7 +1048,7 @@ out_nolock:
 /*
  * Wait until we successfully acquire the write lock
  */
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
-- 
cgit v1.2.3


From 2cfb7a1b031b0e816af7a6ee0c6ab83b0acdf05a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 8 Feb 2022 09:43:33 +0000
Subject: sched/fair: Improve consistency of allowed NUMA balance calculations

There are inconsistencies when determining if a NUMA imbalance is allowed
that should be corrected.

o allow_numa_imbalance changes types and is not always examining
  the destination group so both the type should be corrected as
  well as the naming.
o find_idlest_group uses the sched_domain's weight instead of the
  group weight which is different to find_busiest_group
o find_busiest_group uses the source group instead of the destination
  which is different to task_numa_find_cpu
o Both find_idlest_group and find_busiest_group should account
  for the number of running tasks if a move was allowed to be
  consistent with task_numa_find_cpu

Fixes: 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA nodes")
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Link: https://lore.kernel.org/r/20220208094334.16379-2-mgorman@techsingularity.net
---
 kernel/sched/fair.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5dca13ff89f2..ea710168ae91 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9005,9 +9005,10 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * This is an approximation as the number of running tasks may not be
  * related to the number of busy CPUs due to sched_setaffinity.
  */
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool
+allow_numa_imbalance(unsigned int running, unsigned int weight)
 {
-	return (dst_running < (dst_weight >> 2));
+	return (running < (weight >> 2));
 }
 
 /*
@@ -9141,12 +9142,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 				return idlest;
 #endif
 			/*
-			 * Otherwise, keep the task on this node to stay close
-			 * its wakeup source and improve locality. If there is
-			 * a real need of migration, periodic load balance will
-			 * take care of it.
+			 * Otherwise, keep the task close to the wakeup source
+			 * and improve locality if the number of running tasks
+			 * would remain below threshold where an imbalance is
+			 * allowed. If there is a real need of migration,
+			 * periodic load balance will take care of it.
 			 */
-			if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+			if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight))
 				return NULL;
 		}
 
@@ -9352,7 +9354,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		/* Consider allowing a small imbalance between NUMA groups */
 		if (env->sd->flags & SD_NUMA) {
 			env->imbalance = adjust_numa_imbalance(env->imbalance,
-				busiest->sum_nr_running, busiest->group_weight);
+				local->sum_nr_running + 1, local->group_weight);
 		}
 
 		return;
-- 
cgit v1.2.3


From e496132ebedd870b67f1f6d2428f9bb9d7ae27fd Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 8 Feb 2022 09:43:34 +0000
Subject: sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans
 multiple LLCs

Commit 7d2b5dd0bcc4 ("sched/numa: Allow a floating imbalance between NUMA
nodes") allowed an imbalance between NUMA nodes such that communicating
tasks would not be pulled apart by the load balancer. This works fine when
there is a 1:1 relationship between LLC and node but can be suboptimal
for multiple LLCs if independent tasks prematurely use CPUs sharing cache.

Zen* has multiple LLCs per node with local memory channels and due to
the allowed imbalance, it's far harder to tune some workloads to run
optimally than it is on hardware that has 1 LLC per node. This patch
allows an imbalance to exist up to the point where LLCs should be balanced
between nodes.

On a Zen3 machine running STREAM parallelised with OMP to have on instance
per LLC the results and without binding, the results are

                            5.17.0-rc0             5.17.0-rc0
                               vanilla       sched-numaimb-v6
MB/sec copy-16    162596.94 (   0.00%)   580559.74 ( 257.05%)
MB/sec scale-16   136901.28 (   0.00%)   374450.52 ( 173.52%)
MB/sec add-16     157300.70 (   0.00%)   564113.76 ( 258.62%)
MB/sec triad-16   151446.88 (   0.00%)   564304.24 ( 272.61%)

STREAM can use directives to force the spread if the OpenMP is new
enough but that doesn't help if an application uses threads and
it's not known in advance how many threads will be created.

Coremark is a CPU and cache intensive benchmark parallelised with
threads. When running with 1 thread per core, the vanilla kernel
allows threads to contend on cache. With the patch;

                               5.17.0-rc0             5.17.0-rc0
                                  vanilla       sched-numaimb-v5
Min       Score-16   368239.36 (   0.00%)   389816.06 (   5.86%)
Hmean     Score-16   388607.33 (   0.00%)   427877.08 *  10.11%*
Max       Score-16   408945.69 (   0.00%)   481022.17 (  17.62%)
Stddev    Score-16    15247.04 (   0.00%)    24966.82 ( -63.75%)
CoeffVar  Score-16        3.92 (   0.00%)        5.82 ( -48.48%)

It can also make a big difference for semi-realistic workloads
like specjbb which can execute arbitrary numbers of threads without
advance knowledge of how they should be placed. Even in cases where
the average performance is neutral, the results are more stable.

                               5.17.0-rc0             5.17.0-rc0
                                  vanilla       sched-numaimb-v6
Hmean     tput-1      71631.55 (   0.00%)    73065.57 (   2.00%)
Hmean     tput-8     582758.78 (   0.00%)   556777.23 (  -4.46%)
Hmean     tput-16   1020372.75 (   0.00%)  1009995.26 (  -1.02%)
Hmean     tput-24   1416430.67 (   0.00%)  1398700.11 (  -1.25%)
Hmean     tput-32   1687702.72 (   0.00%)  1671357.04 (  -0.97%)
Hmean     tput-40   1798094.90 (   0.00%)  2015616.46 *  12.10%*
Hmean     tput-48   1972731.77 (   0.00%)  2333233.72 (  18.27%)
Hmean     tput-56   2386872.38 (   0.00%)  2759483.38 (  15.61%)
Hmean     tput-64   2909475.33 (   0.00%)  2925074.69 (   0.54%)
Hmean     tput-72   2585071.36 (   0.00%)  2962443.97 (  14.60%)
Hmean     tput-80   2994387.24 (   0.00%)  3015980.59 (   0.72%)
Hmean     tput-88   3061408.57 (   0.00%)  3010296.16 (  -1.67%)
Hmean     tput-96   3052394.82 (   0.00%)  2784743.41 (  -8.77%)
Hmean     tput-104  2997814.76 (   0.00%)  2758184.50 (  -7.99%)
Hmean     tput-112  2955353.29 (   0.00%)  2859705.09 (  -3.24%)
Hmean     tput-120  2889770.71 (   0.00%)  2764478.46 (  -4.34%)
Hmean     tput-128  2871713.84 (   0.00%)  2750136.73 (  -4.23%)
Stddev    tput-1       5325.93 (   0.00%)     2002.53 (  62.40%)
Stddev    tput-8       6630.54 (   0.00%)    10905.00 ( -64.47%)
Stddev    tput-16     25608.58 (   0.00%)     6851.16 (  73.25%)
Stddev    tput-24     12117.69 (   0.00%)     4227.79 (  65.11%)
Stddev    tput-32     27577.16 (   0.00%)     8761.05 (  68.23%)
Stddev    tput-40     59505.86 (   0.00%)     2048.49 (  96.56%)
Stddev    tput-48    168330.30 (   0.00%)    93058.08 (  44.72%)
Stddev    tput-56    219540.39 (   0.00%)    30687.02 (  86.02%)
Stddev    tput-64    121750.35 (   0.00%)     9617.36 (  92.10%)
Stddev    tput-72    223387.05 (   0.00%)    34081.13 (  84.74%)
Stddev    tput-80    128198.46 (   0.00%)    22565.19 (  82.40%)
Stddev    tput-88    136665.36 (   0.00%)    27905.97 (  79.58%)
Stddev    tput-96    111925.81 (   0.00%)    99615.79 (  11.00%)
Stddev    tput-104   146455.96 (   0.00%)    28861.98 (  80.29%)
Stddev    tput-112    88740.49 (   0.00%)    58288.23 (  34.32%)
Stddev    tput-120   186384.86 (   0.00%)    45812.03 (  75.42%)
Stddev    tput-128    78761.09 (   0.00%)    57418.48 (  27.10%)

Similarly, for embarassingly parallel problems like NPB-ep, there are
improvements due to better spreading across LLC when the machine is not
fully utilised.

                              vanilla       sched-numaimb-v6
Min       ep.D       31.79 (   0.00%)       26.11 (  17.87%)
Amean     ep.D       31.86 (   0.00%)       26.17 *  17.86%*
Stddev    ep.D        0.07 (   0.00%)        0.05 (  24.41%)
CoeffVar  ep.D        0.22 (   0.00%)        0.20 (   7.97%)
Max       ep.D       31.93 (   0.00%)       26.21 (  17.91%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20220208094334.16379-3-mgorman@techsingularity.net
---
 kernel/sched/fair.c     | 22 ++++++++++----------
 kernel/sched/topology.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea710168ae91..5c4bfffe8c2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1489,6 +1489,7 @@ struct task_numa_env {
 
 	int src_cpu, src_nid;
 	int dst_cpu, dst_nid;
+	int imb_numa_nr;
 
 	struct numa_stats src_stats, dst_stats;
 
@@ -1503,7 +1504,7 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static inline long adjust_numa_imbalance(int imbalance,
-					int dst_running, int dst_weight);
+					int dst_running, int imb_numa_nr);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1885,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 		dst_running = env->dst_stats.nr_running + 1;
 		imbalance = max(0, dst_running - src_running);
 		imbalance = adjust_numa_imbalance(imbalance, dst_running,
-							env->dst_stats.weight);
+						  env->imb_numa_nr);
 
 		/* Use idle CPU if there is no imbalance */
 		if (!imbalance) {
@@ -1949,8 +1950,10 @@ static int task_numa_migrate(struct task_struct *p)
 	 */
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-	if (sd)
+	if (sd) {
 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+		env.imb_numa_nr = sd->imb_numa_nr;
+	}
 	rcu_read_unlock();
 
 	/*
@@ -9005,10 +9008,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
  * This is an approximation as the number of running tasks may not be
  * related to the number of busy CPUs due to sched_setaffinity.
  */
-static inline bool
-allow_numa_imbalance(unsigned int running, unsigned int weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
 {
-	return (running < (weight >> 2));
+	return running <= imb_numa_nr;
 }
 
 /*
@@ -9148,7 +9150,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 			 * allowed. If there is a real need of migration,
 			 * periodic load balance will take care of it.
 			 */
-			if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight))
+			if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
 				return NULL;
 		}
 
@@ -9240,9 +9242,9 @@ next_group:
 #define NUMA_IMBALANCE_MIN 2
 
 static inline long adjust_numa_imbalance(int imbalance,
-				int dst_running, int dst_weight)
+				int dst_running, int imb_numa_nr)
 {
-	if (!allow_numa_imbalance(dst_running, dst_weight))
+	if (!allow_numa_imbalance(dst_running, imb_numa_nr))
 		return imbalance;
 
 	/*
@@ -9354,7 +9356,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		/* Consider allowing a small imbalance between NUMA groups */
 		if (env->sd->flags & SD_NUMA) {
 			env->imbalance = adjust_numa_imbalance(env->imbalance,
-				local->sum_nr_running + 1, local->group_weight);
+				local->sum_nr_running + 1, env->sd->imb_numa_nr);
 		}
 
 		return;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..e6cd55951304 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2242,6 +2242,59 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	/*
+	 * Calculate an allowed NUMA imbalance such that LLCs do not get
+	 * imbalanced.
+	 */
+	for_each_cpu(i, cpu_map) {
+		unsigned int imb = 0;
+		unsigned int imb_span = 1;
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			struct sched_domain *child = sd->child;
+
+			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
+				struct sched_domain *top, *top_p;
+				unsigned int nr_llcs;
+
+				/*
+				 * For a single LLC per node, allow an
+				 * imbalance up to 25% of the node. This is an
+				 * arbitrary cutoff based on SMT-2 to balance
+				 * between memory bandwidth and avoiding
+				 * premature sharing of HT resources and SMT-4
+				 * or SMT-8 *may* benefit from a different
+				 * cutoff.
+				 *
+				 * For multiple LLCs, allow an imbalance
+				 * until multiple tasks would share an LLC
+				 * on one node while LLCs on another node
+				 * remain idle.
+				 */
+				nr_llcs = sd->span_weight / child->span_weight;
+				if (nr_llcs == 1)
+					imb = sd->span_weight >> 2;
+				else
+					imb = nr_llcs;
+				sd->imb_numa_nr = imb;
+
+				/* Set span based on the first NUMA domain. */
+				top = sd;
+				top_p = top->parent;
+				while (top_p && !(top_p->flags & SD_NUMA)) {
+					top = top->parent;
+					top_p = top->parent;
+				}
+				imb_span = top_p ? top_p->span_weight : sd->span_weight;
+			} else {
+				int factor = max(1U, (sd->span_weight / imb_span));
+
+				sd->imb_numa_nr = imb * factor;
+			}
+		}
+	}
+
 	/* Calculate CPU capacity for physical packages and nodes */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
 		if (!cpumask_test_cpu(i, cpu_map))
-- 
cgit v1.2.3


From 00ba9357d18947859b7ef03a82c7f4185567ff0b Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 11 Feb 2022 23:32:36 +0100
Subject: ARM: ixp4xx: Drop custom DMA coherency and bouncing

The new PCI driver does not need any of this stuff, so just
drop it.

Cc: iommu@lists.linux-foundation.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20220211223238.648934-12-linus.walleij@linaro.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 kernel/dma/mapping.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9478eccd1c8e..559461a826ba 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -745,7 +745,6 @@ int dma_set_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_mask);
 
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
 int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
 	/*
@@ -761,7 +760,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 	return 0;
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
 
 size_t dma_max_mapping_size(struct device *dev)
 {
-- 
cgit v1.2.3


From 77498617857f68496b360081dde1a492d40c28b2 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Wed, 2 Feb 2022 09:18:18 -0800
Subject: printk: Add panic_in_progress helper

This will be used help avoid deadlocks during panics. Although it would
be better to include this in linux/panic.h, it would require that header
to include linux/atomic.h as well. On some architectures, this results
in a circular dependency as well. So instead add the helper directly to
printk.c.

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220202171821.179394-2-stephen.s.brennan@oracle.com
---
 kernel/printk/printk.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 155229f0cf0f..f04bbed0aa79 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -256,6 +256,11 @@ static void __up_console_sem(unsigned long ip)
 }
 #define up_console_sem() __up_console_sem(_RET_IP_)
 
+static bool panic_in_progress(void)
+{
+	return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
-- 
cgit v1.2.3


From d51507098ff91e863b6e0a8047507741d59b8175 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Wed, 2 Feb 2022 09:18:19 -0800
Subject: printk: disable optimistic spin during panic

A CPU executing with console lock spinning enabled might be halted
during a panic. Before the panicking CPU calls console_flush_on_panic(),
it may call console_trylock(), which attempts to optimistically spin,
deadlocking the panic CPU:

CPU 0 (panic CPU)             CPU 1
-----------------             ------
                              printk() {
                                vprintk_func() {
                                  vprintk_default() {
                                    vprintk_emit() {
                                      console_unlock() {
                                        console_lock_spinning_enable();
                                        ... printing to console ...
panic() {
  crash_smp_send_stop() {
    NMI  -------------------> HALT
  }
  atomic_notifier_call_chain() {
    printk() {
      ...
      console_trylock_spinnning() {
        // optimistic spin infinitely

This hang during panic can be induced when a kdump kernel is loaded, and
crash_kexec_post_notifiers=1 is present on the kernel command line. The
following script which concurrently writes to /dev/kmsg, and triggers a
panic, can result in this hang:

    #!/bin/bash
    date
    # 991 chars (based on log buffer size):
    chars="$(printf 'a%.0s' {1..991})"
    while :; do
        echo $chars > /dev/kmsg
    done &
    echo c > /proc/sysrq-trigger &
    date
    exit

To avoid this deadlock, ensure that console_trylock_spinning() does not
allow spinning once a panic has begun.

Fixes: dbdda842fe96 ("printk: Add console owner and waiter logic to load balance console writes")

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220202171821.179394-3-stephen.s.brennan@oracle.com
---
 kernel/printk/printk.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f04bbed0aa79..e83c12770104 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1847,6 +1847,16 @@ static int console_trylock_spinning(void)
 	if (console_trylock())
 		return 1;
 
+	/*
+	 * It's unsafe to spin once a panic has begun. If we are the
+	 * panic CPU, we may have already halted the owner of the
+	 * console_sem. If we are not the panic CPU, then we should
+	 * avoid taking console_sem, so the panic CPU has a better
+	 * chance of cleanly acquiring it later.
+	 */
+	if (panic_in_progress())
+		return 0;
+
 	printk_safe_enter_irqsave(flags);
 
 	raw_spin_lock(&console_owner_lock);
-- 
cgit v1.2.3


From 13fb0f74d7029df3b8137f11ef955e578a4a4a60 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Wed, 2 Feb 2022 09:18:20 -0800
Subject: printk: Avoid livelock with heavy printk during panic

During panic(), if another CPU is writing heavily the kernel log (e.g.
via /dev/kmsg), then the panic CPU may livelock writing out its messages
to the console. Note when too many messages are dropped during panic and
suppress further printk, except from the panic CPU. This could result in
some important messages being dropped. However, messages are already
being dropped, so this approach at least prevents a livelock.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220202171821.179394-4-stephen.s.brennan@oracle.com
---
 kernel/printk/printk.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e83c12770104..2ec6b547cda6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -93,6 +93,12 @@ EXPORT_SYMBOL_GPL(console_drivers);
  */
 int __read_mostly suppress_printk;
 
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+int __read_mostly suppress_panic_printk;
+
 #ifdef CONFIG_LOCKDEP
 static struct lockdep_map console_lock_dep_map = {
 	.name = "console_lock"
@@ -2232,6 +2238,10 @@ asmlinkage int vprintk_emit(int facility, int level,
 	if (unlikely(suppress_printk))
 		return 0;
 
+	if (unlikely(suppress_panic_printk) &&
+	    atomic_read(&panic_cpu) != raw_smp_processor_id())
+		return 0;
+
 	if (level == LOGLEVEL_SCHED) {
 		level = LOGLEVEL_DEFAULT;
 		in_sched = true;
@@ -2617,6 +2627,7 @@ void console_unlock(void)
 {
 	static char ext_text[CONSOLE_EXT_LOG_MAX];
 	static char text[CONSOLE_LOG_MAX];
+	static int panic_console_dropped;
 	unsigned long flags;
 	bool do_cond_resched, retry;
 	struct printk_info info;
@@ -2671,6 +2682,10 @@ skip:
 		if (console_seq != r.info->seq) {
 			console_dropped += r.info->seq - console_seq;
 			console_seq = r.info->seq;
+			if (panic_in_progress() && panic_console_dropped++ > 10) {
+				suppress_panic_printk = 1;
+				pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
+			}
 		}
 
 		if (suppress_message_printing(r.info->level)) {
-- 
cgit v1.2.3


From 8ebc476fd51e6c0fd3174ec1959a20ba99d4c5e5 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Wed, 2 Feb 2022 09:18:21 -0800
Subject: printk: Drop console_sem during panic

If another CPU is in panic, we are about to be halted. Try to gracefully
abandon the console_sem, leaving it free for the panic CPU to grab.

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220202171821.179394-5-stephen.s.brennan@oracle.com
---
 kernel/printk/printk.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2ec6b547cda6..6a51907a33b9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2597,6 +2597,25 @@ static int have_callable_console(void)
 	return 0;
 }
 
+/*
+ * Return true when this CPU should unlock console_sem without pushing all
+ * messages to the console. This reduces the chance that the console is
+ * locked when the panic CPU tries to use it.
+ */
+static bool abandon_console_lock_in_panic(void)
+{
+	if (!panic_in_progress())
+		return false;
+
+	/*
+	 * We can use raw_smp_processor_id() here because it is impossible for
+	 * the task to be migrated to the panic_cpu, or away from it. If
+	 * panic_cpu has already been set, and we're not currently executing on
+	 * that CPU, then we never will be.
+	 */
+	return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
 /*
  * Can we actually use the console at this time on this cpu?
  *
@@ -2745,6 +2764,10 @@ skip:
 		if (handover)
 			return;
 
+		/* Allow panic_cpu to take over the consoles safely */
+		if (abandon_console_lock_in_panic())
+			break;
+
 		if (do_cond_resched)
 			cond_resched();
 	}
@@ -2762,7 +2785,7 @@ skip:
 	 * flush, no worries.
 	 */
 	retry = prb_read_valid(prb, next_seq, NULL);
-	if (retry && console_trylock())
+	if (retry && !abandon_console_lock_in_panic() && console_trylock())
 		goto again;
 }
 EXPORT_SYMBOL(console_unlock);
-- 
cgit v1.2.3


From 7a853c2d5951419fdf3c1c9d2b6f5a38f6a6857d Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 7 Feb 2022 15:02:45 -0800
Subject: mm: Change CONFIG option for mm->pasid field

This currently depends on CONFIG_IOMMU_SUPPORT. But it is only
needed when CONFIG_IOMMU_SVA option is enabled.

Change the CONFIG guards around definition and initialization
of mm->pasid field.

Suggested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220207230254.3342514-3-fenghua.yu@intel.com
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b21..6ee7551d3bd2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,7 +1021,7 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 
 static void mm_init_pasid(struct mm_struct *mm)
 {
-#ifdef CONFIG_IOMMU_SUPPORT
+#ifdef CONFIG_IOMMU_SVA
 	mm->pasid = INIT_PASID;
 #endif
 }
-- 
cgit v1.2.3


From 218b957a6959a2fb5b3967fc824072bb89ac2611 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Wed, 8 Dec 2021 23:41:53 +0000
Subject: rcu: Add mutex for rcu boost kthread spawning and affinity setting

As we handle parallel CPU bringup, we will need to take care to avoid
spawning multiple boost threads, or race conditions when setting their
affinity. Spotted by Paul McKenney.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        |  1 +
 kernel/rcu/tree.h        |  3 +++
 kernel/rcu/tree_plugin.h | 10 ++++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4c25a6283b0..d1d1a8c51223 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4570,6 +4570,7 @@ static void __init rcu_init_one(void)
 			init_waitqueue_head(&rnp->exp_wq[2]);
 			init_waitqueue_head(&rnp->exp_wq[3]);
 			spin_lock_init(&rnp->exp_lock);
+			mutex_init(&rnp->boost_kthread_mutex);
 		}
 	}
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 486fc901bd08..3b8b60de07c3 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -110,6 +110,9 @@ struct rcu_node {
 				/*  side effect, not as a lock. */
 	unsigned long boost_time;
 				/* When to start boosting (jiffies). */
+	struct mutex boost_kthread_mutex;
+				/* Exclusion for thread spawning and affinity */
+				/*  manipulation. */
 	struct task_struct *boost_kthread_task;
 				/* kthread that takes care of priority */
 				/*  boosting for this rcu_node structure. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c5b45c2f68a1..07845dcd33c5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1172,15 +1172,16 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	struct sched_param sp;
 	struct task_struct *t;
 
+	mutex_lock(&rnp->boost_kthread_mutex);
 	if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
-		return;
+		goto out;
 
 	rcu_state.boost = 1;
 
 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
 			   "rcub/%d", rnp_index);
 	if (WARN_ON_ONCE(IS_ERR(t)))
-		return;
+		goto out;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	rnp->boost_kthread_task = t;
@@ -1188,6 +1189,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
 	sp.sched_priority = kthread_prio;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+
+ out:
+	mutex_unlock(&rnp->boost_kthread_mutex);
 }
 
 /*
@@ -1210,6 +1214,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 		return;
 	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
 		return;
+	mutex_lock(&rnp->boost_kthread_mutex);
 	for_each_leaf_node_possible_cpu(rnp, cpu)
 		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
 		    cpu != outgoingcpu)
@@ -1218,6 +1223,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 	if (cpumask_weight(cm) == 0)
 		cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
 	set_cpus_allowed_ptr(t, cm);
+	mutex_unlock(&rnp->boost_kthread_mutex);
 	free_cpumask_var(cm);
 }
 
-- 
cgit v1.2.3


From 1fe09ebe7a9c9907f516779fbe4954165dd01529 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sat, 18 Dec 2021 09:30:33 -0800
Subject: rcu: Inline __call_rcu() into call_rcu()

Because __call_rcu() is invoked only by call_rcu(), this commit inlines
the former into the latter.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 91 +++++++++++++++++++++++++------------------------------
 1 file changed, 42 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d1d1a8c51223..f1bb7ccc0084 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2995,9 +2995,47 @@ static void check_cb_ovld(struct rcu_data *rdp)
 	raw_spin_unlock_rcu_node(rnp);
 }
 
-/* Helper function for call_rcu() and friends.  */
-static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func)
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections.  This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section.  On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu().  It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section.  Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	static atomic_t doublefrees;
 	unsigned long flags;
@@ -3011,7 +3049,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 		/*
 		 * Probable double call_rcu(), so leak the callback.
 		 * Use rcu:rcu_callback trace event to find the previous
-		 * time callback was passed to __call_rcu().
+		 * time callback was passed to call_rcu().
 		 */
 		if (atomic_inc_return(&doublefrees) < 4) {
 			pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
@@ -3060,51 +3098,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 		local_irq_restore(flags);
 	}
 }
-
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed.  However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested.  In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections.  This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section.  On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu().  It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section.  Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
-{
-	__call_rcu(head, func);
-}
 EXPORT_SYMBOL_GPL(call_rcu);
 
 
-- 
cgit v1.2.3


From d818cc76e2b4d5f6cebf8c7ce1160d652d7e572b Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Sun, 26 Dec 2021 08:52:04 +0800
Subject: kasan: Record work creation stack trace with interrupts enabled

Recording the work creation stack trace for KASAN reports in
call_rcu() is expensive, due to unwinding the stack, but also
due to acquiring depot_lock inside stackdepot (which may be contended).
Because calling kasan_record_aux_stack_noalloc() does not require
interrupts to already be disabled, this may unnecessarily extend
the time with interrupts disabled.

Therefore, move calling kasan_record_aux_stack() before the section
with interrupts disabled.

Acked-by: Marco Elver <elver@google.com>
Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1bb7ccc0084..ca8d7dd026ee 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3060,8 +3060,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 	}
 	head->func = func;
 	head->next = NULL;
-	local_irq_save(flags);
 	kasan_record_aux_stack_noalloc(head);
+	local_irq_save(flags);
 	rdp = this_cpu_ptr(&rcu_data);
 
 	/* Add the callback to our list. */
-- 
cgit v1.2.3


From c09929031018913b5783872a8b8cdddef4a543c7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 4 Jan 2022 10:34:34 -0800
Subject: rcu: Mark writes to the rcu_segcblist structure's ->flags field

KCSAN reports data races between the rcu_segcblist_clear_flags() and
rcu_segcblist_set_flags() functions, though misreporting the latter
as a call to rcu_segcblist_is_enabled() from call_rcu().  This commit
converts the updates of this field to WRITE_ONCE(), relying on the
resulting unmarked reads to continue to detect buggy concurrent writes
to this field.

Reported-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcu_segcblist.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index e373fbe44da5..431cee212467 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -56,13 +56,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
 static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
 					   int flags)
 {
-	rsclp->flags |= flags;
+	WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
 }
 
 static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
 					     int flags)
 {
-	rsclp->flags &= ~flags;
+	WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
 }
 
 static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
-- 
cgit v1.2.3


From 58d4292bd037b01fbb940a5170817f7d40caa9d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 14 Jan 2022 16:07:28 -0800
Subject: rcu: Uninline multi-use function: finish_rcuwait()

This is a rarely used function, so uninlining its 3 instructions
is probably a win or a wash - but the main motivation is to
make <linux/rcuwait.h> independent of task_struct details.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 156892c22bb5..180ff9c41fa8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -407,6 +407,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 }
 EXPORT_SYMBOL_GPL(__wait_rcu_gp);
 
+void finish_rcuwait(struct rcuwait *w)
+{
+	rcu_assign_pointer(w->task, NULL);
+	__set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL_GPL(finish_rcuwait);
+
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
 void init_rcu_head(struct rcu_head *head)
 {
-- 
cgit v1.2.3


From 6a2c1d450a6a328027280a854019c55de989e14e Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 23 Jan 2022 10:38:53 -0800
Subject: rcu: Replace cpumask_weight with cpumask_empty where appropriate

In some places, RCU code calls cpumask_weight() to check if any bit of a
given cpumask is set. We can do it more efficiently with cpumask_empty()
because cpumask_empty() stops traversing the cpumask as soon as it finds
first set bit, while cpumask_weight() counts all bits unconditionally.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h   | 4 ++--
 kernel/rcu/tree_plugin.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index eeafb546a7a0..f83c7b1d6110 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1169,7 +1169,7 @@ void __init rcu_init_nohz(void)
 	struct rcu_data *rdp;
 
 #if defined(CONFIG_NO_HZ_FULL)
-	if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+	if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
 		need_rcu_nocb_mask = true;
 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
 
@@ -1348,7 +1348,7 @@ static void __init rcu_organize_nocb_kthreads(void)
  */
 void rcu_bind_current_to_nocb(void)
 {
-	if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+	if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
 		WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
 }
 EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 07845dcd33c5..efd0c87d2ffa 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1220,7 +1220,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 		    cpu != outgoingcpu)
 			cpumask_set_cpu(cpu, cm);
 	cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU));
-	if (cpumask_weight(cm) == 0)
+	if (cpumask_empty(cm))
 		cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
 	set_cpus_allowed_ptr(t, cm);
 	mutex_unlock(&rnp->boost_kthread_mutex);
-- 
cgit v1.2.3


From a6cbd44093ef305b02ad5f80ed54abf0148a696c Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 7 Feb 2022 15:02:47 -0800
Subject: kernel/fork: Initialize mm's PASID

A new mm doesn't have a PASID yet when it's created. Initialize
the mm's PASID on fork() or for init_mm to INVALID_IOASID (-1).

INIT_PASID (0) is reserved for kernel legacy DMA PASID. It cannot be
allocated to a user process. Initializing the process's PASID to 0 may
cause confusion that's why the process uses the reserved kernel legacy
DMA PASID. Initializing the PASID to INVALID_IOASID (-1) explicitly
tells the process doesn't have a valid PASID yet.

Even though the only user of mm_pasid_init() is in fork.c, define it in
<linux/sched/mm.h> as the first of three mm/pasid life cycle functions
(init/set/drop) to keep these all together.

Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220207230254.3342514-5-fenghua.yu@intel.com
---
 kernel/fork.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 6ee7551d3bd2..deacd2c17a7f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/scs.h>
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
+#include <linux/sched/mm.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -1019,13 +1020,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 #endif
 }
 
-static void mm_init_pasid(struct mm_struct *mm)
-{
-#ifdef CONFIG_IOMMU_SVA
-	mm->pasid = INIT_PASID;
-#endif
-}
-
 static void mm_init_uprobes_state(struct mm_struct *mm)
 {
 #ifdef CONFIG_UPROBES
@@ -1054,7 +1048,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_init_cpumask(mm);
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
-	mm_init_pasid(mm);
+	mm_pasid_init(mm);
 	RCU_INIT_POINTER(mm->exe_file, NULL);
 	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
-- 
cgit v1.2.3


From 701fac40384f07197b106136012804c3cae0b3de Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Mon, 7 Feb 2022 15:02:48 -0800
Subject: iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm
 exit

PASIDs are process-wide. It was attempted to use refcounted PASIDs to
free them when the last thread drops the refcount. This turned out to
be complex and error prone. Given the fact that the PASID space is 20
bits, which allows up to 1M processes to have a PASID associated
concurrently, PASID resource exhaustion is not a realistic concern.

Therefore, it was decided to simplify the approach and stick with lazy
on demand PASID allocation, but drop the eager free approach and make an
allocated PASID's lifetime bound to the lifetime of the process.

Get rid of the refcounting mechanisms and replace/rename the interfaces
to reflect this new approach.

  [ bp: Massage commit message. ]

Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Joerg Roedel <jroedel@suse.de>
Link: https://lore.kernel.org/r/20220207230254.3342514-6-fenghua.yu@intel.com
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index deacd2c17a7f..c03c6682464c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1115,6 +1115,7 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	mm_pasid_drop(mm);
 	mmdrop(mm);
 }
 
-- 
cgit v1.2.3


From a3d29e8291b622780eb6e4e3eeaf2b24ec78fd43 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 7 Feb 2022 15:02:50 -0800
Subject: sched: Define and initialize a flag to identify valid PASID in the
 task

Add a new single bit field to the task structure to track whether this task
has initialized the IA32_PASID MSR to the mm's PASID.

Initialize the field to zero when creating a new task with fork/clone.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220207230254.3342514-8-fenghua.yu@intel.com
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c03c6682464c..51fd1df994b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -968,6 +968,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->use_memdelay = 0;
 #endif
 
+#ifdef CONFIG_IOMMU_SVA
+	tsk->pasid_activated = 0;
+#endif
+
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
 #endif
-- 
cgit v1.2.3


From 45ec846c1cd11835a29c85645065115dd791aa45 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 9 Feb 2022 16:25:58 +0000
Subject: irqdomain: Let irq_domain_set_{info,hwirq_and_chip} take a const
 irq_chip

In order to let a const irqchip be fed to the irqchip layer, adjust
the various prototypes. An extra cast in irq_domain_set_hwirq_and_chip()
is required to avoid a warning.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20220209162607.1118325-2-maz@kernel.org
---
 kernel/irq/irqdomain.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index bf38c546aa25..d5ce96510549 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
  * @chip_data:	The associated chip data
  */
 int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
-				  irq_hw_number_t hwirq, struct irq_chip *chip,
+				  irq_hw_number_t hwirq,
+				  const struct irq_chip *chip,
 				  void *chip_data)
 {
 	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
 		return -ENOENT;
 
 	irq_data->hwirq = hwirq;
-	irq_data->chip = chip ? chip : &no_irq_chip;
+	irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
 	irq_data->chip_data = chip_data;
 
 	return 0;
@@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
  * @handler_name:	The interrupt handler name
  */
 void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
-			 irq_hw_number_t hwirq, struct irq_chip *chip,
+			 irq_hw_number_t hwirq, const struct irq_chip *chip,
 			 void *chip_data, irq_flow_handler_t handler,
 			 void *handler_data, const char *handler_name)
 {
@@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
  * @handler_name:	The interrupt handler name
  */
 void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
-			 irq_hw_number_t hwirq, struct irq_chip *chip,
+			 irq_hw_number_t hwirq, const struct irq_chip *chip,
 			 void *chip_data, irq_flow_handler_t handler,
 			 void *handler_data, const char *handler_name)
 {
-- 
cgit v1.2.3


From 393e1280f765661cf39785e967676a4e57324126 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 9 Feb 2022 16:25:59 +0000
Subject: genirq: Allow irq_chip registration functions to take a const
 irq_chip

In order to let a const irqchip be fed to the irqchip layer, adjust
the various prototypes. An extra cast in irq_set_chip()() is required
to avoid a warning.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20220209162607.1118325-3-maz@kernel.org
---
 kernel/irq/chip.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 24b6f2b40e5e..54af0deb239b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -38,7 +38,7 @@ struct irqaction chained_action = {
  *	@irq:	irq number
  *	@chip:	pointer to irq chip description structure
  */
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
 {
 	unsigned long flags;
 	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
 	if (!desc)
 		return -EINVAL;
 
-	if (!chip)
-		chip = &no_irq_chip;
-
-	desc->irq_data.chip = chip;
+	desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
 	irq_put_desc_unlock(desc, flags);
 	/*
 	 * For !CONFIG_SPARSE_IRQ make the irq show up in
@@ -1073,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
 EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
 
 void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
 			      irq_flow_handler_t handle, const char *name)
 {
 	irq_set_chip(irq, chip);
-- 
cgit v1.2.3


From 0a25cb5544f4f01d2e7c06164555fd9cd6eb64fd Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 10 Feb 2022 09:27:21 +0000
Subject: genirq/debugfs: Use irq_print_chip() when provided by irqchip

Since irqchips have the option to implement irq_print_chip, use this
when available to output the irqchip name in debugfs.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220215112154.1360040-1-maz@kernel.org
---
 kernel/irq/debugfs.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..2b43f5f5033d 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
 		seq_printf(m, "chip: None\n");
 		return;
 	}
-	seq_printf(m, "%*schip:    %s\n", ind, "", chip->name);
-	seq_printf(m, "%*sflags:   0x%lx\n", ind + 1, "", chip->flags);
+	seq_printf(m, "%*schip:    ", ind, "");
+	if (chip->irq_print_chip)
+		chip->irq_print_chip(data, m);
+	else
+		seq_printf(m, "%s", chip->name);
+	seq_printf(m, "\n%*sflags:   0x%lx\n", ind + 1, "", chip->flags);
 	irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
 			    ARRAY_SIZE(irqchip_flags));
 }
-- 
cgit v1.2.3


From 2ba3673d70178bf07fb75ff25c54bc478add4021 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 11 Feb 2022 12:29:37 +0106
Subject: printk: use atomic updates for klogd work

The per-cpu @printk_pending variable can be updated from
sleepable contexts, such as:

  get_random_bytes()
    warn_unseeded_randomness()
      printk_deferred()
        defer_console_output()

and can be updated from interrupt contexts, such as:

  handle_irq_event_percpu()
    __irq_wake_thread()
      wake_up_process()
        try_to_wake_up()
          select_task_rq()
            select_fallback_rq()
              printk_deferred()
                defer_console_output()

and can be updated from NMI contexts, such as:

  vprintk()
    if (in_nmi()) defer_console_output()

Therefore the atomic variant of the updating functions must be used.

Replace __this_cpu_xchg() with this_cpu_xchg().
Replace __this_cpu_or() with this_cpu_or().

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/87iltld4ue.fsf@jogness.linutronix.de
---
 kernel/printk/printk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 155229f0cf0f..25dce8b74791 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3226,7 +3226,7 @@ static DEFINE_PER_CPU(int, printk_pending);
 
 static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
-	int pending = __this_cpu_xchg(printk_pending, 0);
+	int pending = this_cpu_xchg(printk_pending, 0);
 
 	if (pending & PRINTK_PENDING_OUTPUT) {
 		/* If trylock fails, someone else is doing the printing */
@@ -3260,7 +3260,7 @@ void defer_console_output(void)
 		return;
 
 	preempt_disable();
-	__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
+	this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
 	irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 8cbf062a250ed52148badf6f3ffd03657dd4a3f0 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Tue, 15 Feb 2022 14:57:32 +0800
Subject: bpf: Reject kfunc calls that overflow insn->imm

Now kfunc call uses s32 to represent the offset between the address of
kfunc and __bpf_call_base, but it doesn't check whether or not s32 will
be overflowed. The overflow is possible when kfunc is in module and the
offset between module and kernel is greater than 2GB. Take arm64 as an
example, before commit b2eed9b58811 ("arm64/kernel: kaslr: reduce module
randomization range to 2 GB"), the offset between module symbol and
__bpf_call_base will in 4GB range due to KASLR and may overflow s32.

So add an extra checking to reject these invalid kfunc calls.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220215065732.3179408-1-houtao1@huawei.com
---
 kernel/bpf/verifier.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bbef86cb4e72..d7473fee247c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1842,6 +1842,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 	struct bpf_kfunc_desc *desc;
 	const char *func_name;
 	struct btf *desc_btf;
+	unsigned long call_imm;
 	unsigned long addr;
 	int err;
 
@@ -1926,9 +1927,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 		return -EINVAL;
 	}
 
+	call_imm = BPF_CALL_IMM(addr);
+	/* Check whether or not the relative offset overflows desc->imm */
+	if ((unsigned long)(s32)call_imm != call_imm) {
+		verbose(env, "address of kernel function %s is out of range\n",
+			func_name);
+		return -EINVAL;
+	}
+
 	desc = &tab->descs[tab->nr_descs++];
 	desc->func_id = func_id;
-	desc->imm = BPF_CALL_IMM(addr);
+	desc->imm = call_imm;
 	desc->offset = offset;
 	err = btf_distill_func_proto(&env->log, desc_btf,
 				     func_proto, func_name,
-- 
cgit v1.2.3


From e1478d8eaf27704db17a44dee4c53696ed01fc9c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 15 Feb 2022 13:41:04 +0100
Subject: asm-generic: Refactor dereference_[kernel]_function_descriptor()

dereference_function_descriptor() and
dereference_kernel_function_descriptor() are identical on the
three architectures implementing them.

Make them common and put them out-of-line in kernel/extable.c
which is one of the users and has similar type of functions.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Helge Deller <deller@gmx.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/449db09b2eba57f4ab05f80102a67d8675bc8bcd.1644928018.git.christophe.leroy@csgroup.eu
---
 kernel/extable.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index b6f330f0fe74..394c39b86e38 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
    Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
 
 */
+#include <linux/elf.h>
 #include <linux/ftrace.h>
 #include <linux/memory.h>
 #include <linux/extable.h>
@@ -132,12 +133,32 @@ out:
 }
 
 /*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
  * are actually only tokens to some data that then holds the
  * real function address. As a result, to find if a function
  * pointer is part of the kernel text, we need to do some
  * special dereferencing first.
  */
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+	func_desc_t *desc = ptr;
+	void *p;
+
+	if (!get_kernel_nofault(p, (void *)&desc->addr))
+		ptr = p;
+	return ptr;
+}
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+	if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+		return ptr;
+
+	return dereference_function_descriptor(ptr);
+}
+#endif
+
 int func_ptr_is_kernel_text(void *ptr)
 {
 	unsigned long addr;
-- 
cgit v1.2.3


From b64913394f123e819bffabc79a0e48f98e78dc5d Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Tue, 15 Feb 2022 13:41:06 +0100
Subject: lkdtm: Really write into kernel text in WRITE_KERN

WRITE_KERN is supposed to overwrite some kernel text, namely
do_overwritten() function.

But at the time being it overwrites do_overwritten() function
descriptor, not function text.

Fix it by dereferencing the function descriptor to obtain
function text pointer. Export dereference_function_descriptor()
for when LKDTM is built as a module.

And make do_overwritten() noinline so that it is really
do_overwritten() which is called by lkdtm_WRITE_KERN().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/31e58eaffb5bc51c07d8d4891d1982100ade8cfc.1644928018.git.christophe.leroy@csgroup.eu
---
 kernel/extable.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 394c39b86e38..bda5e9761541 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -149,6 +149,7 @@ void *dereference_function_descriptor(void *ptr)
 		ptr = p;
 	return ptr;
 }
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
 
 void *dereference_kernel_function_descriptor(void *ptr)
 {
-- 
cgit v1.2.3


From 1087ad4e3f88c474b8134a482720782922bf3fdf Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 10 Feb 2022 14:49:06 -0800
Subject: sched: replace cpumask_weight with cpumask_empty where appropriate

In some places, kernel/sched code calls cpumask_weight() to check if
any bit of a given cpumask is set. We can do it more efficiently with
cpumask_empty() because cpumask_empty() stops traversing the cpumask as
soon as it finds first set bit, while cpumask_weight() counts all bits
unconditionally.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220210224933.379149-23-yury.norov@gmail.com
---
 kernel/sched/core.c     | 2 +-
 kernel/sched/topology.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d863d7f6ad7..c620aab27acf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8708,7 +8708,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 {
 	int ret = 1;
 
-	if (!cpumask_weight(cur))
+	if (cpumask_empty(cur))
 		return ret;
 
 	ret = dl_cpuset_cpumask_can_shrink(cur, trial);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e6cd55951304..1c84b48def20 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -74,7 +74,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!cpumask_weight(sched_group_span(group))) {
+		if (cpumask_empty(sched_group_span(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: empty group\n");
 			break;
-- 
cgit v1.2.3


From 0fb3978b0aac3a5c08637aed03cc2d65f793508f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 14 Feb 2022 20:15:52 +0800
Subject: sched/numa: Fix NUMA topology for systems with CPU-less nodes

The NUMA topology parameters (sched_numa_topology_type,
sched_domains_numa_levels, and sched_max_numa_distance, etc.)
identified by scheduler may be wrong for systems with CPU-less nodes.

For example, the ACPI SLIT of a system with CPU-less persistent
memory (Intel Optane DCPMM) nodes is as follows,

[000h 0000   4]                    Signature : "SLIT"    [System Locality Information Table]
[004h 0004   4]                 Table Length : 0000042C
[008h 0008   1]                     Revision : 01
[009h 0009   1]                     Checksum : 59
[00Ah 0010   6]                       Oem ID : "XXXX"
[010h 0016   8]                 Oem Table ID : "XXXXXXX"
[018h 0024   4]                 Oem Revision : 00000001
[01Ch 0028   4]              Asl Compiler ID : "INTL"
[020h 0032   4]        Asl Compiler Revision : 20091013

[024h 0036   8]                   Localities : 0000000000000004
[02Ch 0044   4]                 Locality   0 : 0A 15 11 1C
[030h 0048   4]                 Locality   1 : 15 0A 1C 11
[034h 0052   4]                 Locality   2 : 11 1C 0A 1C
[038h 0056   4]                 Locality   3 : 1C 11 1C 0A

While the `numactl -H` output is as follows,

available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 0 size: 64136 MB
node 0 free: 5981 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
node 1 size: 64466 MB
node 1 free: 10415 MB
node 2 cpus:
node 2 size: 253952 MB
node 2 free: 253920 MB
node 3 cpus:
node 3 size: 253952 MB
node 3 free: 253951 MB
node distances:
node   0   1   2   3
  0:  10  21  17  28
  1:  21  10  28  17
  2:  17  28  10  28
  3:  28  17  28  10

In this system, there are only 2 sockets.  In each memory controller,
both DRAM and PMEM DIMMs are installed.  Although the physical NUMA
topology is simple, the logical NUMA topology becomes a little
complex.  Because both the distance(0, 1) and distance (1, 3) are less
than the distance (0, 3), it appears that node 1 sits between node 0
and node 3.  And the whole system appears to be a glueless mesh NUMA
topology type.  But it's definitely not, there is even no CPU in node 3.

This isn't a practical problem now yet.  Because the PMEM nodes (node
2 and node 3 in example system) are offlined by default during system
boot.  So init_numa_topology_type() called during system boot will
ignore them and set sched_numa_topology_type to NUMA_DIRECT.  And
init_numa_topology_type() is only called at runtime when a CPU of a
never-onlined-before node gets plugged in.  And there's no CPU in the
PMEM nodes.  But it appears better to fix this to make the code more
robust.

To test the potential problem.  We have used a debug patch to call
init_numa_topology_type() when the PMEM node is onlined (in
__set_migration_target_nodes()).  With that, the NUMA parameters
identified by scheduler is as follows,

sched_numa_topology_type:	NUMA_GLUELESS_MESH
sched_domains_numa_levels:	4
sched_max_numa_distance:	28

To fix the issue, the CPU-less nodes are ignored when the NUMA topology
parameters are identified.  Because a node may become CPU-less or not
at run time because of CPU hotplug, the NUMA topology parameters need
to be re-initialized at runtime for CPU hotplug too.

With the patch, the NUMA parameters identified for the example system
above is as follows,

sched_numa_topology_type:	NUMA_DIRECT
sched_domains_numa_levels:	2
sched_max_numa_distance:	21

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220214121553.582248-1-ying.huang@intel.com
---
 kernel/sched/core.c     |   5 +-
 kernel/sched/fair.c     |  15 ++--
 kernel/sched/sched.h    |   6 +-
 kernel/sched/topology.c | 206 ++++++++++++++++++++++++++++--------------------
 4 files changed, 137 insertions(+), 95 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c620aab27acf..b2226922206d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9052,6 +9052,7 @@ int sched_cpu_activate(unsigned int cpu)
 	set_cpu_active(cpu, true);
 
 	if (sched_smp_initialized) {
+		sched_update_numa(cpu, true);
 		sched_domains_numa_masks_set(cpu);
 		cpuset_cpu_active();
 	}
@@ -9130,10 +9131,12 @@ int sched_cpu_deactivate(unsigned int cpu)
 	if (!sched_smp_initialized)
 		return 0;
 
+	sched_update_numa(cpu, false);
 	ret = cpuset_cpu_inactive(cpu);
 	if (ret) {
 		balance_push_set(cpu, false);
 		set_cpu_active(cpu, true);
+		sched_update_numa(cpu, true);
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
@@ -9236,7 +9239,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 void __init sched_init_smp(void)
 {
-	sched_init_numa();
+	sched_init_numa(NUMA_NO_NODE);
 
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c4bfffe8c2c..da3230b84250 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1259,10 +1259,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
 
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
-					int maxdist, bool task)
+					int lim_dist, bool task)
 {
 	unsigned long score = 0;
-	int node;
+	int node, max_dist;
 
 	/*
 	 * All nodes are directly connected, and the same distance
@@ -1271,6 +1271,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 	if (sched_numa_topology_type == NUMA_DIRECT)
 		return 0;
 
+	/* sched_max_numa_distance may be changed in parallel. */
+	max_dist = READ_ONCE(sched_max_numa_distance);
 	/*
 	 * This code is called for each node, introducing N^2 complexity,
 	 * which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1285,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 		 * The furthest away nodes in the system are not interesting
 		 * for placement; nid was already counted.
 		 */
-		if (dist == sched_max_numa_distance || node == nid)
+		if (dist >= max_dist || node == nid)
 			continue;
 
 		/*
@@ -1293,8 +1295,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 		 * "hoplimit", only nodes closer by than "hoplimit" are part
 		 * of each group. Skip other nodes.
 		 */
-		if (sched_numa_topology_type == NUMA_BACKPLANE &&
-					dist >= maxdist)
+		if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
 			continue;
 
 		/* Add up the faults from nearby nodes. */
@@ -1312,8 +1313,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 		 * This seems to result in good task placement.
 		 */
 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
-			faults *= (sched_max_numa_distance - dist);
-			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+			faults *= (max_dist - dist);
+			faults /= (max_dist - LOCAL_DISTANCE);
 		}
 
 		score += faults;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9b33ba9c3c42..3da5718cd641 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1662,12 +1662,14 @@ enum numa_topology_type {
 extern enum numa_topology_type sched_numa_topology_type;
 extern int sched_max_numa_distance;
 extern bool find_numa_distance(int distance);
-extern void sched_init_numa(void);
+extern void sched_init_numa(int offline_node);
+extern void sched_update_numa(int cpu, bool online);
 extern void sched_domains_numa_masks_set(unsigned int cpu);
 extern void sched_domains_numa_masks_clear(unsigned int cpu);
 extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
 #else
-static inline void sched_init_numa(void) { }
+static inline void sched_init_numa(int offline_node) { }
+static inline void sched_update_numa(int cpu, bool online) { }
 static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
 static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
 static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1c84b48def20..5db322c9cb3f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1492,8 +1492,6 @@ static int			sched_domains_curr_level;
 int				sched_max_numa_distance;
 static int			*sched_domains_numa_distance;
 static struct cpumask		***sched_domains_numa_masks;
-
-static unsigned long __read_mostly *sched_numa_onlined_nodes;
 #endif
 
 /*
@@ -1651,6 +1649,7 @@ static struct sched_domain_topology_level default_topology[] = {
 
 static struct sched_domain_topology_level *sched_domain_topology =
 	default_topology;
+static struct sched_domain_topology_level *sched_domain_topology_saved;
 
 #define for_each_sd_topology(tl)			\
 	for (tl = sched_domain_topology; tl->mask; tl++)
@@ -1661,6 +1660,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
 		return;
 
 	sched_domain_topology = tl;
+	sched_domain_topology_saved = NULL;
 }
 
 #ifdef CONFIG_NUMA
@@ -1684,8 +1684,12 @@ static void sched_numa_warn(const char *str)
 
 	for (i = 0; i < nr_node_ids; i++) {
 		printk(KERN_WARNING "  ");
-		for (j = 0; j < nr_node_ids; j++)
-			printk(KERN_CONT "%02d ", node_distance(i,j));
+		for (j = 0; j < nr_node_ids; j++) {
+			if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
+				printk(KERN_CONT "(%02d) ", node_distance(i,j));
+			else
+				printk(KERN_CONT " %02d  ", node_distance(i,j));
+		}
 		printk(KERN_CONT "\n");
 	}
 	printk(KERN_WARNING "\n");
@@ -1693,19 +1697,34 @@ static void sched_numa_warn(const char *str)
 
 bool find_numa_distance(int distance)
 {
-	int i;
+	bool found = false;
+	int i, *distances;
 
 	if (distance == node_distance(0, 0))
 		return true;
 
+	rcu_read_lock();
+	distances = rcu_dereference(sched_domains_numa_distance);
+	if (!distances)
+		goto unlock;
 	for (i = 0; i < sched_domains_numa_levels; i++) {
-		if (sched_domains_numa_distance[i] == distance)
-			return true;
+		if (distances[i] == distance) {
+			found = true;
+			break;
+		}
 	}
+unlock:
+	rcu_read_unlock();
 
-	return false;
+	return found;
 }
 
+#define for_each_cpu_node_but(n, nbut)		\
+	for_each_node_state(n, N_CPU)		\
+		if (n == nbut)			\
+			continue;		\
+		else
+
 /*
  * A system can have three types of NUMA topology:
  * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
@@ -1725,7 +1744,7 @@ bool find_numa_distance(int distance)
  *   there is an intermediary node C, which is < N hops away from both
  *   nodes A and B, the system is a glueless mesh.
  */
-static void init_numa_topology_type(void)
+static void init_numa_topology_type(int offline_node)
 {
 	int a, b, c, n;
 
@@ -1736,14 +1755,14 @@ static void init_numa_topology_type(void)
 		return;
 	}
 
-	for_each_online_node(a) {
-		for_each_online_node(b) {
+	for_each_cpu_node_but(a, offline_node) {
+		for_each_cpu_node_but(b, offline_node) {
 			/* Find two nodes furthest removed from each other. */
 			if (node_distance(a, b) < n)
 				continue;
 
 			/* Is there an intermediary node between a and b? */
-			for_each_online_node(c) {
+			for_each_cpu_node_but(c, offline_node) {
 				if (node_distance(a, c) < n &&
 				    node_distance(b, c) < n) {
 					sched_numa_topology_type =
@@ -1756,17 +1775,22 @@ static void init_numa_topology_type(void)
 			return;
 		}
 	}
+
+	pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
+	sched_numa_topology_type = NUMA_DIRECT;
 }
 
 
 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
 
-void sched_init_numa(void)
+void sched_init_numa(int offline_node)
 {
 	struct sched_domain_topology_level *tl;
 	unsigned long *distance_map;
 	int nr_levels = 0;
 	int i, j;
+	int *distances;
+	struct cpumask ***masks;
 
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
@@ -1777,12 +1801,13 @@ void sched_init_numa(void)
 		return;
 
 	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
-	for (i = 0; i < nr_node_ids; i++) {
-		for (j = 0; j < nr_node_ids; j++) {
+	for_each_cpu_node_but(i, offline_node) {
+		for_each_cpu_node_but(j, offline_node) {
 			int distance = node_distance(i, j);
 
 			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
 				sched_numa_warn("Invalid distance value range");
+				bitmap_free(distance_map);
 				return;
 			}
 
@@ -1795,16 +1820,17 @@ void sched_init_numa(void)
 	 */
 	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
 
-	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
-	if (!sched_domains_numa_distance) {
+	distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!distances) {
 		bitmap_free(distance_map);
 		return;
 	}
 
 	for (i = 0, j = 0; i < nr_levels; i++, j++) {
 		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
-		sched_domains_numa_distance[i] = j;
+		distances[i] = j;
 	}
+	rcu_assign_pointer(sched_domains_numa_distance, distances);
 
 	bitmap_free(distance_map);
 
@@ -1826,8 +1852,8 @@ void sched_init_numa(void)
 	 */
 	sched_domains_numa_levels = 0;
 
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
-	if (!sched_domains_numa_masks)
+	masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+	if (!masks)
 		return;
 
 	/*
@@ -1835,31 +1861,20 @@ void sched_init_numa(void)
 	 * CPUs of nodes that are that many hops away from us.
 	 */
 	for (i = 0; i < nr_levels; i++) {
-		sched_domains_numa_masks[i] =
-			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
-		if (!sched_domains_numa_masks[i])
+		masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!masks[i])
 			return;
 
-		for (j = 0; j < nr_node_ids; j++) {
+		for_each_cpu_node_but(j, offline_node) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
 			int k;
 
 			if (!mask)
 				return;
 
-			sched_domains_numa_masks[i][j] = mask;
-
-			for_each_node(k) {
-				/*
-				 * Distance information can be unreliable for
-				 * offline nodes, defer building the node
-				 * masks to its bringup.
-				 * This relies on all unique distance values
-				 * still being visible at init time.
-				 */
-				if (!node_online(j))
-					continue;
+			masks[i][j] = mask;
 
+			for_each_cpu_node_but(k, offline_node) {
 				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
 					sched_numa_warn("Node-distance not symmetric");
 
@@ -1870,6 +1885,7 @@ void sched_init_numa(void)
 			}
 		}
 	}
+	rcu_assign_pointer(sched_domains_numa_masks, masks);
 
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
@@ -1907,59 +1923,67 @@ void sched_init_numa(void)
 		};
 	}
 
+	sched_domain_topology_saved = sched_domain_topology;
 	sched_domain_topology = tl;
 
 	sched_domains_numa_levels = nr_levels;
-	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
-
-	init_numa_topology_type();
-
-	sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
-	if (!sched_numa_onlined_nodes)
-		return;
+	WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
 
-	bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
-	for_each_online_node(i)
-		bitmap_set(sched_numa_onlined_nodes, i, 1);
+	init_numa_topology_type(offline_node);
 }
 
-static void __sched_domains_numa_masks_set(unsigned int node)
-{
-	int i, j;
-
-	/*
-	 * NUMA masks are not built for offline nodes in sched_init_numa().
-	 * Thus, when a CPU of a never-onlined-before node gets plugged in,
-	 * adding that new CPU to the right NUMA masks is not sufficient: the
-	 * masks of that CPU's node must also be updated.
-	 */
-	if (test_bit(node, sched_numa_onlined_nodes))
-		return;
 
-	bitmap_set(sched_numa_onlined_nodes, node, 1);
-
-	for (i = 0; i < sched_domains_numa_levels; i++) {
-		for (j = 0; j < nr_node_ids; j++) {
-			if (!node_online(j) || node == j)
-				continue;
+static void sched_reset_numa(void)
+{
+	int nr_levels, *distances;
+	struct cpumask ***masks;
 
-			if (node_distance(j, node) > sched_domains_numa_distance[i])
+	nr_levels = sched_domains_numa_levels;
+	sched_domains_numa_levels = 0;
+	sched_max_numa_distance = 0;
+	sched_numa_topology_type = NUMA_DIRECT;
+	distances = sched_domains_numa_distance;
+	rcu_assign_pointer(sched_domains_numa_distance, NULL);
+	masks = sched_domains_numa_masks;
+	rcu_assign_pointer(sched_domains_numa_masks, NULL);
+	if (distances || masks) {
+		int i, j;
+
+		synchronize_rcu();
+		kfree(distances);
+		for (i = 0; i < nr_levels && masks; i++) {
+			if (!masks[i])
 				continue;
-
-			/* Add remote nodes in our masks */
-			cpumask_or(sched_domains_numa_masks[i][node],
-				   sched_domains_numa_masks[i][node],
-				   sched_domains_numa_masks[0][j]);
+			for_each_node(j)
+				kfree(masks[i][j]);
+			kfree(masks[i]);
 		}
+		kfree(masks);
 	}
+	if (sched_domain_topology_saved) {
+		kfree(sched_domain_topology);
+		sched_domain_topology = sched_domain_topology_saved;
+		sched_domain_topology_saved = NULL;
+	}
+}
+
+/*
+ * Call with hotplug lock held
+ */
+void sched_update_numa(int cpu, bool online)
+{
+	int node;
 
+	node = cpu_to_node(cpu);
 	/*
-	 * A new node has been brought up, potentially changing the topology
-	 * classification.
-	 *
-	 * Note that this is racy vs any use of sched_numa_topology_type :/
+	 * Scheduler NUMA topology is updated when the first CPU of a
+	 * node is onlined or the last CPU of a node is offlined.
 	 */
-	init_numa_topology_type();
+	if (cpumask_weight(cpumask_of_node(node)) != 1)
+		return;
+
+	sched_reset_numa();
+	sched_init_numa(online ? NUMA_NO_NODE : node);
 }
 
 void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1967,11 +1991,9 @@ void sched_domains_numa_masks_set(unsigned int cpu)
 	int node = cpu_to_node(cpu);
 	int i, j;
 
-	__sched_domains_numa_masks_set(node);
-
 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			if (!node_online(j))
+			if (!node_state(j, N_CPU))
 				continue;
 
 			/* Set ourselves in the remote node's masks */
@@ -1986,8 +2008,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
 	int i, j;
 
 	for (i = 0; i < sched_domains_numa_levels; i++) {
-		for (j = 0; j < nr_node_ids; j++)
-			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+		for (j = 0; j < nr_node_ids; j++) {
+			if (sched_domains_numa_masks[i][j])
+				cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+		}
 	}
 }
 
@@ -2001,14 +2025,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
  */
 int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
 {
-	int i, j = cpu_to_node(cpu);
+	int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
+	struct cpumask ***masks;
 
+	rcu_read_lock();
+	masks = rcu_dereference(sched_domains_numa_masks);
+	if (!masks)
+		goto unlock;
 	for (i = 0; i < sched_domains_numa_levels; i++) {
-		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
-		if (cpu < nr_cpu_ids)
-			return cpu;
+		if (!masks[i][j])
+			break;
+		cpu = cpumask_any_and(cpus, masks[i][j]);
+		if (cpu < nr_cpu_ids) {
+			found = cpu;
+			break;
+		}
 	}
-	return nr_cpu_ids;
+unlock:
+	rcu_read_unlock();
+
+	return found;
 }
 
 #endif /* CONFIG_NUMA */
-- 
cgit v1.2.3


From 5c7b1aaf139dab5072311853bacc40fc3457d1f9 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 14 Feb 2022 20:15:53 +0800
Subject: sched/numa: Avoid migrating task to CPU-less node

In a typical memory tiering system, there's no CPU in slow (PMEM) NUMA
nodes.  But if the number of the hint page faults on a PMEM node is
the max for a task, The current NUMA balancing policy may try to place
the task on the PMEM node instead of DRAM node.  This is unreasonable,
because there's no CPU in PMEM NUMA nodes.  To fix this, CPU-less
nodes are ignored when searching the migration target node for a task
in this patch.

To test the patch, we run a workload that accesses more memory in PMEM
node than memory in DRAM node.  Without the patch, the PMEM node will
be chosen as preferred node in task_numa_placement().  While the DRAM
node will be chosen instead with the patch.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220214121553.582248-2-ying.huang@intel.com
---
 kernel/sched/fair.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3230b84250..11a72e1b3b2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1989,7 +1989,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 */
 	ng = deref_curr_numa_group(p);
 	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
-		for_each_online_node(nid) {
+		for_each_node_state(nid, N_CPU) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
 
@@ -2087,13 +2087,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
 	unsigned long faults, max_faults = 0;
 	int nid, active_nodes = 0;
 
-	for_each_online_node(nid) {
+	for_each_node_state(nid, N_CPU) {
 		faults = group_faults_cpu(numa_group, nid);
 		if (faults > max_faults)
 			max_faults = faults;
 	}
 
-	for_each_online_node(nid) {
+	for_each_node_state(nid, N_CPU) {
 		faults = group_faults_cpu(numa_group, nid);
 		if (faults * ACTIVE_NODE_FRACTION > max_faults)
 			active_nodes++;
@@ -2247,7 +2247,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
 
 		dist = sched_max_numa_distance;
 
-		for_each_online_node(node) {
+		for_each_node_state(node, N_CPU) {
 			score = group_weight(p, node, dist);
 			if (score > max_score) {
 				max_score = score;
@@ -2266,7 +2266,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
 	 * inside the highest scoring group of nodes. The nodemask tricks
 	 * keep the complexity of the search down.
 	 */
-	nodes = node_online_map;
+	nodes = node_states[N_CPU];
 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
 		unsigned long max_faults = 0;
 		nodemask_t max_group = NODE_MASK_NONE;
@@ -2405,6 +2405,21 @@ static void task_numa_placement(struct task_struct *p)
 		}
 	}
 
+	/* Cannot migrate task to CPU-less node */
+	if (!node_state(max_nid, N_CPU)) {
+		int near_nid = max_nid;
+		int distance, near_distance = INT_MAX;
+
+		for_each_node_state(nid, N_CPU) {
+			distance = node_distance(max_nid, nid);
+			if (distance < near_distance) {
+				near_nid = nid;
+				near_distance = distance;
+			}
+		}
+		max_nid = near_nid;
+	}
+
 	if (ng) {
 		numa_group_count_active_nodes(ng);
 		spin_unlock_irq(group_lock);
-- 
cgit v1.2.3


From e6df4ead85d9da1b07dd40bd4c6d2182f3e210c4 Mon Sep 17 00:00:00 2001
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Date: Tue, 25 Jan 2022 14:56:58 +0800
Subject: psi: fix possible trigger missing in the window

When a new threshold breaching stall happens after a psi event was
generated and within the window duration, the new event is not
generated because the events are rate-limited to one per window. If
after that no new stall is recorded then the event will not be
generated even after rate-limiting duration has passed. This is
happening because with no new stall, window_update will not be called
even though threshold was previously breached. To fix this, record
threshold breaching occurrence and generate the event once window
duration is passed.

Suggested-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Link: https://lore.kernel.org/r/1643093818-19835-1-git-send-email-huangzhaoyang@gmail.com
---
 kernel/sched/psi.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cfe76f704d8a..e9d623cb8d1b 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -523,7 +523,7 @@ static void init_triggers(struct psi_group *group, u64 now)
 static u64 update_triggers(struct psi_group *group, u64 now)
 {
 	struct psi_trigger *t;
-	bool new_stall = false;
+	bool update_total = false;
 	u64 *total = group->total[PSI_POLL];
 
 	/*
@@ -532,24 +532,35 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	 */
 	list_for_each_entry(t, &group->triggers, node) {
 		u64 growth;
+		bool new_stall;
 
-		/* Check for stall activity */
-		if (group->polling_total[t->state] == total[t->state])
-			continue;
+		new_stall = group->polling_total[t->state] != total[t->state];
 
+		/* Check for stall activity or a previous threshold breach */
+		if (!new_stall && !t->pending_event)
+			continue;
 		/*
-		 * Multiple triggers might be looking at the same state,
-		 * remember to update group->polling_total[] once we've
-		 * been through all of them. Also remember to extend the
-		 * polling time if we see new stall activity.
+		 * Check for new stall activity, as well as deferred
+		 * events that occurred in the last window after the
+		 * trigger had already fired (we want to ratelimit
+		 * events without dropping any).
 		 */
-		new_stall = true;
-
-		/* Calculate growth since last update */
-		growth = window_update(&t->win, now, total[t->state]);
-		if (growth < t->threshold)
-			continue;
-
+		if (new_stall) {
+			/*
+			 * Multiple triggers might be looking at the same state,
+			 * remember to update group->polling_total[] once we've
+			 * been through all of them. Also remember to extend the
+			 * polling time if we see new stall activity.
+			 */
+			update_total = true;
+
+			/* Calculate growth since last update */
+			growth = window_update(&t->win, now, total[t->state]);
+			if (growth < t->threshold)
+				continue;
+
+			t->pending_event = true;
+		}
 		/* Limit event signaling to once per window */
 		if (now < t->last_event_time + t->win.size)
 			continue;
@@ -558,9 +569,11 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 		if (cmpxchg(&t->event, 0, 1) == 0)
 			wake_up_interruptible(&t->event_wait);
 		t->last_event_time = now;
+		/* Reset threshold breach flag once event got generated */
+		t->pending_event = false;
 	}
 
-	if (new_stall)
+	if (update_total)
 		memcpy(group->polling_total, total,
 				sizeof(group->polling_total));
 
@@ -1125,6 +1138,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->last_event_time = 0;
 	init_waitqueue_head(&t->event_wait);
 	kref_init(&t->refcount);
+	t->pending_event = false;
 
 	mutex_lock(&group->trigger_lock);
 
-- 
cgit v1.2.3


From 7b45b51e778021cd7817b8f0d743a2c73205c011 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 7 Feb 2022 16:59:04 +0100
Subject: workqueue: Decouple HK_FLAG_WQ and HK_FLAG_DOMAIN cpumask fetch

To prepare for supporting each feature of the housekeeping cpumask
toward cpuset, prepare each of the HK_FLAG_* entries to move to their
own cpumask with enforcing to fetch them individually. The new
constraint is that multiple HK_FLAG_* entries can't be mixed together
anymore in a single call to housekeeping cpumask().

This will later allow, for example, to runtime modify the cpulist passed
through "isolcpus=", "nohz_full=" and "rcu_nocbs=" kernel boot
parameters.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220207155910.527133-3-frederic@kernel.org
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33f1106b4f99..61ed310621ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6006,13 +6006,13 @@ static void __init wq_numa_init(void)
 void __init workqueue_init_early(void)
 {
 	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
-	int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
 	int i, cpu;
 
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
+	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_WQ));
+	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From 04d4e665a60902cf36e7ad39af1179cb5df542ad Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 7 Feb 2022 16:59:06 +0100
Subject: sched/isolation: Use single feature type while referring to
 housekeeping cpumask

Refer to housekeeping APIs using single feature types instead of flags.
This prevents from passing multiple isolation features at once to
housekeeping interfaces, which soon won't be possible anymore as each
isolation features will have their own cpumask.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20220207155910.527133-5-frederic@kernel.org
---
 kernel/cgroup/cpuset.c   |  6 +++---
 kernel/cpu.c             |  4 ++--
 kernel/irq/cpuhotplug.c  |  4 ++--
 kernel/irq/manage.c      |  4 ++--
 kernel/kthread.c         |  4 ++--
 kernel/rcu/tasks.h       |  2 +-
 kernel/rcu/tree_plugin.h |  6 +++---
 kernel/sched/core.c      | 12 ++++++------
 kernel/sched/fair.c      | 10 +++++-----
 kernel/sched/isolation.c | 32 ++++++++++++++++++++++----------
 kernel/sched/topology.c  |  8 ++++----
 kernel/watchdog.c        |  2 +-
 kernel/workqueue.c       |  4 ++--
 13 files changed, 55 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dc653ab26e50..e4e18a2cb404 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -803,7 +803,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		cpumask_and(doms[0], top_cpuset.effective_cpus,
-			    housekeeping_cpumask(HK_FLAG_DOMAIN));
+			    housekeeping_cpumask(HK_TYPE_DOMAIN));
 
 		goto done;
 	}
@@ -833,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
 		      cpumask_intersects(cp->cpus_allowed,
-					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
+					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
 		if (root_load_balance &&
@@ -922,7 +922,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
-				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35e..f39eb0b52dfe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1488,8 +1488,8 @@ int freeze_secondary_cpus(int primary)
 	cpu_maps_update_begin();
 	if (primary == -1) {
 		primary = cpumask_first(cpu_online_mask);
-		if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
-			primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+		if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+			primary = housekeeping_any_cpu(HK_TYPE_TIMER);
 	} else {
 		if (!cpu_online(primary))
 			primary = cpumask_first(cpu_online_mask);
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 39a41c56ad4f..1ed2b1739363 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
 {
 	const struct cpumask *hk_mask;
 
-	if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+	if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
 		return false;
 
-	hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+	hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
 	if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
 		return false;
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f23ffd30385b..c03f71d5ec10 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	 * online.
 	 */
 	if (irqd_affinity_is_managed(data) &&
-	    housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+	    housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
 		const struct cpumask *hk_mask, *prog_mask;
 
 		static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
 		static struct cpumask tmp_mask;
 
-		hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+		hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
 
 		raw_spin_lock(&tmp_mask_lock);
 		cpumask_and(&tmp_mask, mask, hk_mask);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 38c6dd822da8..d100d5a15b38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -356,7 +356,7 @@ static int kthread(void *_create)
 	 * back to default in case they have been changed.
 	 */
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
-	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -722,7 +722,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
-	set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
+	set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 84f1d91604cc..6093b200dff7 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -492,7 +492,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 	struct rcu_tasks *rtp = arg;
 
 	/* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
-	housekeeping_affine(current, HK_FLAG_RCU);
+	housekeeping_affine(current, HK_TYPE_RCU);
 	WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
 
 	/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c5b45c2f68a1..65f25a32f6d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1214,9 +1214,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
 		    cpu != outgoingcpu)
 			cpumask_set_cpu(cpu, cm);
-	cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU));
+	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
 	if (cpumask_weight(cm) == 0)
-		cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
+		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
 	set_cpus_allowed_ptr(t, cm);
 	free_cpumask_var(cm);
 }
@@ -1291,7 +1291,7 @@ static void rcu_bind_gp_kthread(void)
 {
 	if (!tick_nohz_full_enabled())
 		return;
-	housekeeping_affine(current, HK_FLAG_RCU);
+	housekeeping_affine(current, HK_TYPE_RCU);
 }
 
 /* Record the current task on dyntick-idle entry. */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b2226922206d..1e08b02e0cd5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1025,13 +1025,13 @@ int get_nohz_timer_target(void)
 	struct sched_domain *sd;
 	const struct cpumask *hk_mask;
 
-	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+	if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
 		if (!idle_cpu(cpu))
 			return cpu;
 		default_cpu = cpu;
 	}
 
-	hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+	hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
@@ -1047,7 +1047,7 @@ int get_nohz_timer_target(void)
 	}
 
 	if (default_cpu == -1)
-		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+		default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
 	cpu = default_cpu;
 unlock:
 	rcu_read_unlock();
@@ -5371,7 +5371,7 @@ static void sched_tick_start(int cpu)
 	int os;
 	struct tick_work *twork;
 
-	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
 		return;
 
 	WARN_ON_ONCE(!tick_work_cpu);
@@ -5392,7 +5392,7 @@ static void sched_tick_stop(int cpu)
 	struct tick_work *twork;
 	int os;
 
-	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
 		return;
 
 	WARN_ON_ONCE(!tick_work_cpu);
@@ -9251,7 +9251,7 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 
 	/* Move init over to a non-isolated CPU */
-	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
 		BUG();
 	current->flags &= ~PF_NO_SETAFFINITY;
 	sched_init_granularity();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11a72e1b3b2c..dcbd3110c687 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10337,7 +10337,7 @@ static inline int on_null_domain(struct rq *rq)
  * - When one of the busy CPUs notice that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
  *   anywhere yet.
  */
 
@@ -10346,7 +10346,7 @@ static inline int find_new_ilb(void)
 	int ilb;
 	const struct cpumask *hk_mask;
 
-	hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
+	hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
 
 	for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
 
@@ -10362,7 +10362,7 @@ static inline int find_new_ilb(void)
 
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
  */
 static void kick_ilb(unsigned int flags)
 {
@@ -10575,7 +10575,7 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
-	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+	if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
 		return;
 
 	/*
@@ -10791,7 +10791,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
 	 * This CPU doesn't want to be disturbed by scheduler
 	 * housekeeping
 	 */
-	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+	if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
 		return;
 
 	/* Will wake up very soon. No time for doing anything else*/
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 7f06eaf12818..a735d9e229dd 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -9,23 +9,35 @@
  */
 #include "sched.h"
 
+enum hk_flags {
+	HK_FLAG_TIMER		= BIT(HK_TYPE_TIMER),
+	HK_FLAG_RCU		= BIT(HK_TYPE_RCU),
+	HK_FLAG_MISC		= BIT(HK_TYPE_MISC),
+	HK_FLAG_SCHED		= BIT(HK_TYPE_SCHED),
+	HK_FLAG_TICK		= BIT(HK_TYPE_TICK),
+	HK_FLAG_DOMAIN		= BIT(HK_TYPE_DOMAIN),
+	HK_FLAG_WQ		= BIT(HK_TYPE_WQ),
+	HK_FLAG_MANAGED_IRQ	= BIT(HK_TYPE_MANAGED_IRQ),
+	HK_FLAG_KTHREAD		= BIT(HK_TYPE_KTHREAD),
+};
+
 DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
 EXPORT_SYMBOL_GPL(housekeeping_overridden);
 static cpumask_var_t housekeeping_mask;
 static unsigned int housekeeping_flags;
 
-bool housekeeping_enabled(enum hk_flags flags)
+bool housekeeping_enabled(enum hk_type type)
 {
-	return !!(housekeeping_flags & flags);
+	return !!(housekeeping_flags & BIT(type));
 }
 EXPORT_SYMBOL_GPL(housekeeping_enabled);
 
-int housekeeping_any_cpu(enum hk_flags flags)
+int housekeeping_any_cpu(enum hk_type type)
 {
 	int cpu;
 
 	if (static_branch_unlikely(&housekeeping_overridden)) {
-		if (housekeeping_flags & flags) {
+		if (housekeeping_flags & BIT(type)) {
 			cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
 			if (cpu < nr_cpu_ids)
 				return cpu;
@@ -37,27 +49,27 @@ int housekeeping_any_cpu(enum hk_flags flags)
 }
 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
-const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping_flags & flags)
+		if (housekeeping_flags & BIT(type))
 			return housekeeping_mask;
 	return cpu_possible_mask;
 }
 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 
-void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+void housekeeping_affine(struct task_struct *t, enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping_flags & flags)
+		if (housekeeping_flags & BIT(type))
 			set_cpus_allowed_ptr(t, housekeeping_mask);
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
-bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+bool housekeeping_test_cpu(int cpu, enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping_flags & flags)
+		if (housekeeping_flags & BIT(type))
 			return cpumask_test_cpu(cpu, housekeeping_mask);
 	return true;
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5db322c9cb3f..32841c6741d1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1366,7 +1366,7 @@ static void asym_cpu_capacity_scan(void)
 	list_for_each_entry(entry, &asym_cap_list, link)
 		cpumask_clear(cpu_capacity_span(entry));
 
-	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
+	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
 		asym_cpu_capacity_update_data(cpu);
 
 	list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
@@ -2440,7 +2440,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
 	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
-	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
+	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
 	err = build_sched_domains(doms_cur[0], NULL);
 
 	return err;
@@ -2529,7 +2529,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
 		if (doms_new) {
 			n = 1;
 			cpumask_and(doms_new[0], cpu_active_mask,
-				    housekeeping_cpumask(HK_FLAG_DOMAIN));
+				    housekeeping_cpumask(HK_TYPE_DOMAIN));
 		}
 	} else {
 		n = ndoms_new;
@@ -2564,7 +2564,7 @@ match1:
 		n = 0;
 		doms_new = &fallback_doms;
 		cpumask_and(doms_new[0], cpu_active_mask,
-			    housekeeping_cpumask(HK_FLAG_DOMAIN));
+			    housekeeping_cpumask(HK_TYPE_DOMAIN));
 	}
 
 	/* Build new domains: */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 99afb88d2e85..9166220457bc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -848,7 +848,7 @@ void __init lockup_detector_init(void)
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
 
 	cpumask_copy(&watchdog_cpumask,
-		     housekeeping_cpumask(HK_FLAG_TIMER));
+		     housekeeping_cpumask(HK_TYPE_TIMER));
 
 	if (!watchdog_nmi_probe())
 		nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61ed310621ea..52e9abbb7759 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6011,8 +6011,8 @@ void __init workqueue_init_early(void)
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_WQ));
-	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
+	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
+	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From 6367b600e31c6b211eadee7bf7f4ed8c755e9176 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 7 Feb 2022 16:59:07 +0100
Subject: sched/isolation: Consolidate check for housekeeping minimum service

There can be two subsequent calls to housekeeping_setup() due to
"nohz_full=" and "isolcpus=" that can mix up.  The two passes each have
their own way to deal with an empty housekeeping set of CPUs.
Consolidate this part and remove the awful "tmp" based naming.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20220207155910.527133-6-frederic@kernel.org
---
 kernel/sched/isolation.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index a735d9e229dd..23085e665faa 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -91,8 +91,7 @@ void __init housekeeping_init(void)
 
 static int __init housekeeping_setup(char *str, enum hk_flags flags)
 {
-	cpumask_var_t non_housekeeping_mask;
-	cpumask_var_t tmp;
+	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
 
 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
 	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
@@ -101,32 +100,32 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
 		return 0;
 	}
 
-	alloc_bootmem_cpumask_var(&tmp);
-	if (!housekeeping_flags) {
-		alloc_bootmem_cpumask_var(&housekeeping_mask);
-		cpumask_andnot(housekeeping_mask,
-			       cpu_possible_mask, non_housekeeping_mask);
+	alloc_bootmem_cpumask_var(&housekeeping_staging);
+	cpumask_andnot(housekeeping_staging,
+		       cpu_possible_mask, non_housekeeping_mask);
 
-		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
-		if (cpumask_empty(tmp)) {
+	if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
+		__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
+		__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+		if (!housekeeping_flags) {
 			pr_warn("Housekeeping: must include one present CPU, "
 				"using boot CPU:%d\n", smp_processor_id());
-			__cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
-			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
 		}
+	}
+
+	if (!housekeeping_flags) {
+		alloc_bootmem_cpumask_var(&housekeeping_mask);
+		cpumask_copy(housekeeping_mask, housekeeping_staging);
 	} else {
-		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
-		if (cpumask_empty(tmp))
-			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
-		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
-		if (!cpumask_equal(tmp, housekeeping_mask)) {
+		if (!cpumask_equal(housekeeping_staging, housekeeping_mask)) {
 			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
-			free_bootmem_cpumask_var(tmp);
+			free_bootmem_cpumask_var(housekeeping_staging);
 			free_bootmem_cpumask_var(non_housekeeping_mask);
 			return 0;
 		}
 	}
-	free_bootmem_cpumask_var(tmp);
+
+	free_bootmem_cpumask_var(housekeeping_staging);
 
 	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
 		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
-- 
cgit v1.2.3


From 0cd3e59de1f53978873669c7c8225ec13e88c3ae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 7 Feb 2022 16:59:08 +0100
Subject: sched/isolation: Consolidate error handling

Centralize the mask freeing and return value for the error path. This
makes potential leaks more visible.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20220207155910.527133-7-frederic@kernel.org
---
 kernel/sched/isolation.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 23085e665faa..828dacec483e 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -92,12 +92,12 @@ void __init housekeeping_init(void)
 static int __init housekeeping_setup(char *str, enum hk_flags flags)
 {
 	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+	int err = 0;
 
 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
 	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
 		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
-		free_bootmem_cpumask_var(non_housekeeping_mask);
-		return 0;
+		goto free_non_housekeeping_mask;
 	}
 
 	alloc_bootmem_cpumask_var(&housekeeping_staging);
@@ -119,30 +119,29 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
 	} else {
 		if (!cpumask_equal(housekeeping_staging, housekeeping_mask)) {
 			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
-			free_bootmem_cpumask_var(housekeeping_staging);
-			free_bootmem_cpumask_var(non_housekeeping_mask);
-			return 0;
+			goto free_housekeeping_staging;
 		}
 	}
 
-	free_bootmem_cpumask_var(housekeeping_staging);
-
 	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
 		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
 			tick_nohz_full_setup(non_housekeeping_mask);
 		} else {
 			pr_warn("Housekeeping: nohz unsupported."
 				" Build with CONFIG_NO_HZ_FULL\n");
-			free_bootmem_cpumask_var(non_housekeeping_mask);
-			return 0;
+			goto free_housekeeping_staging;
 		}
 	}
 
 	housekeeping_flags |= flags;
+	err = 1;
 
+free_housekeeping_staging:
+	free_bootmem_cpumask_var(housekeeping_staging);
+free_non_housekeeping_mask:
 	free_bootmem_cpumask_var(non_housekeeping_mask);
 
-	return 1;
+	return err;
 }
 
 static int __init housekeeping_nohz_full_setup(char *str)
-- 
cgit v1.2.3


From 65e53f869e9f92a23593c66214b88e54fb190a13 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 7 Feb 2022 16:59:09 +0100
Subject: sched/isolation: Fix housekeeping_mask memory leak

If "nohz_full=" or "isolcpus=nohz" are called with CONFIG_NO_HZ_FULL=n,
housekeeping_mask doesn't get freed despite it being unused if
housekeeping_setup() is called for the first time.

Check this scenario first to fix this, so that no useless allocation
is performed.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20220207155910.527133-8-frederic@kernel.org
---
 kernel/sched/isolation.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 828dacec483e..883eee9fae22 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -94,6 +94,14 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
 	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
 	int err = 0;
 
+	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+			pr_warn("Housekeeping: nohz unsupported."
+				" Build with CONFIG_NO_HZ_FULL\n");
+			return 0;
+		}
+	}
+
 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
 	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
 		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
@@ -123,15 +131,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
 		}
 	}
 
-	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
-		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
-			tick_nohz_full_setup(non_housekeeping_mask);
-		} else {
-			pr_warn("Housekeeping: nohz unsupported."
-				" Build with CONFIG_NO_HZ_FULL\n");
-			goto free_housekeeping_staging;
-		}
-	}
+	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK))
+		tick_nohz_full_setup(non_housekeeping_mask);
 
 	housekeeping_flags |= flags;
 	err = 1;
-- 
cgit v1.2.3


From ed3b362d54f0038cafc985248350d301af7af686 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 7 Feb 2022 16:59:10 +0100
Subject: sched/isolation: Split housekeeping cpumask per isolation features

To prepare for supporting each housekeeping feature toward cpuset, split
the global housekeeping cpumask per HK_TYPE_* entry.

This will later allow, for example, to runtime modify the cpulist passed
through "isolcpus=", "nohz_full=" and "rcu_nocbs=" kernel boot
parameters.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Link: https://lore.kernel.org/r/20220207155910.527133-9-frederic@kernel.org
---
 kernel/sched/isolation.c | 91 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 883eee9fae22..b4d10815c45a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -23,12 +23,17 @@ enum hk_flags {
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
 EXPORT_SYMBOL_GPL(housekeeping_overridden);
-static cpumask_var_t housekeeping_mask;
-static unsigned int housekeeping_flags;
+
+struct housekeeping {
+	cpumask_var_t cpumasks[HK_TYPE_MAX];
+	unsigned long flags;
+};
+
+static struct housekeeping housekeeping;
 
 bool housekeeping_enabled(enum hk_type type)
 {
-	return !!(housekeeping_flags & BIT(type));
+	return !!(housekeeping.flags & BIT(type));
 }
 EXPORT_SYMBOL_GPL(housekeeping_enabled);
 
@@ -37,12 +42,12 @@ int housekeeping_any_cpu(enum hk_type type)
 	int cpu;
 
 	if (static_branch_unlikely(&housekeeping_overridden)) {
-		if (housekeeping_flags & BIT(type)) {
-			cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+		if (housekeeping.flags & BIT(type)) {
+			cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
 			if (cpu < nr_cpu_ids)
 				return cpu;
 
-			return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+			return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
 		}
 	}
 	return smp_processor_id();
@@ -52,8 +57,8 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 const struct cpumask *housekeeping_cpumask(enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping_flags & BIT(type))
-			return housekeeping_mask;
+		if (housekeeping.flags & BIT(type))
+			return housekeeping.cpumasks[type];
 	return cpu_possible_mask;
 }
 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
@@ -61,40 +66,53 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 void housekeeping_affine(struct task_struct *t, enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping_flags & BIT(type))
-			set_cpus_allowed_ptr(t, housekeeping_mask);
+		if (housekeeping.flags & BIT(type))
+			set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
 bool housekeeping_test_cpu(int cpu, enum hk_type type)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-		if (housekeeping_flags & BIT(type))
-			return cpumask_test_cpu(cpu, housekeeping_mask);
+		if (housekeeping.flags & BIT(type))
+			return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
 	return true;
 }
 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
 
 void __init housekeeping_init(void)
 {
-	if (!housekeeping_flags)
+	enum hk_type type;
+
+	if (!housekeeping.flags)
 		return;
 
 	static_branch_enable(&housekeeping_overridden);
 
-	if (housekeeping_flags & HK_FLAG_TICK)
+	if (housekeeping.flags & HK_FLAG_TICK)
 		sched_tick_offload_init();
 
-	/* We need at least one CPU to handle housekeeping work */
-	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+	for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+		/* We need at least one CPU to handle housekeeping work */
+		WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+	}
+}
+
+static void __init housekeeping_setup_type(enum hk_type type,
+					   cpumask_var_t housekeeping_staging)
+{
+
+	alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
+	cpumask_copy(housekeeping.cpumasks[type],
+		     housekeeping_staging);
 }
 
-static int __init housekeeping_setup(char *str, enum hk_flags flags)
+static int __init housekeeping_setup(char *str, unsigned long flags)
 {
 	cpumask_var_t non_housekeeping_mask, housekeeping_staging;
 	int err = 0;
 
-	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+	if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
 		if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
 			pr_warn("Housekeeping: nohz unsupported."
 				" Build with CONFIG_NO_HZ_FULL\n");
@@ -115,26 +133,41 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
 	if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
 		__cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
 		__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
-		if (!housekeeping_flags) {
+		if (!housekeeping.flags) {
 			pr_warn("Housekeeping: must include one present CPU, "
 				"using boot CPU:%d\n", smp_processor_id());
 		}
 	}
 
-	if (!housekeeping_flags) {
-		alloc_bootmem_cpumask_var(&housekeeping_mask);
-		cpumask_copy(housekeeping_mask, housekeeping_staging);
+	if (!housekeeping.flags) {
+		/* First setup call ("nohz_full=" or "isolcpus=") */
+		enum hk_type type;
+
+		for_each_set_bit(type, &flags, HK_TYPE_MAX)
+			housekeeping_setup_type(type, housekeeping_staging);
 	} else {
-		if (!cpumask_equal(housekeeping_staging, housekeeping_mask)) {
-			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
-			goto free_housekeeping_staging;
+		/* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
+		enum hk_type type;
+		unsigned long iter_flags = flags & housekeeping.flags;
+
+		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
+			if (!cpumask_equal(housekeeping_staging,
+					   housekeeping.cpumasks[type])) {
+				pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+				goto free_housekeeping_staging;
+			}
 		}
+
+		iter_flags = flags & ~housekeeping.flags;
+
+		for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
+			housekeeping_setup_type(type, housekeeping_staging);
 	}
 
-	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK))
+	if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
 		tick_nohz_full_setup(non_housekeeping_mask);
 
-	housekeeping_flags |= flags;
+	housekeeping.flags |= flags;
 	err = 1;
 
 free_housekeeping_staging:
@@ -147,7 +180,7 @@ free_non_housekeeping_mask:
 
 static int __init housekeeping_nohz_full_setup(char *str)
 {
-	unsigned int flags;
+	unsigned long flags;
 
 	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
 		HK_FLAG_MISC | HK_FLAG_KTHREAD;
@@ -158,7 +191,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
 
 static int __init housekeeping_isolcpus_setup(char *str)
 {
-	unsigned int flags = 0;
+	unsigned long flags = 0;
 	bool illegal = false;
 	char *par;
 	int len;
-- 
cgit v1.2.3


From fb7275acd6fb988313dddd8d3d19efa70d9015ad Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 10 Feb 2022 22:55:26 -0500
Subject: locking/lockdep: Iterate lock_classes directly when reading lockdep
 files

When dumping lock_classes information via /proc/lockdep, we can't take
the lockdep lock as the lock hold time is indeterminate. Iterating
over all_lock_classes without holding lock can be dangerous as there
is a slight chance that it may branch off to other lists leading to
infinite loop or even access invalid memory if changes are made to
all_lock_classes list in parallel.

To avoid this problem, iteration of lock classes is now done directly
on the lock_classes array itself. The lock_classes_in_use bitmap is
checked to see if the lock class is being used. To avoid iterating
the full array all the times, a new max_lock_class_idx value is added
to track the maximum lock_class index that is currently being used.

We can theoretically take the lockdep lock for iterating all_lock_classes
when other lockdep files (lockdep_stats and lock_stat) are accessed as
the lock hold time will be shorter for them. For consistency, they are
also modified to iterate the lock_classes array directly.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220211035526.1329503-2-longman@redhat.com
---
 kernel/locking/lockdep.c           | 14 +++++++----
 kernel/locking/lockdep_internals.h |  6 +++--
 kernel/locking/lockdep_proc.c      | 51 ++++++++++++++++++++++++++++++++------
 3 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 2e6892ec3756..50036c10b518 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -183,11 +183,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
 static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
 unsigned long nr_lock_classes;
 unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long max_lock_class_idx;
 struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
 
 static inline struct lock_class *hlock_class(struct held_lock *hlock)
 {
@@ -338,7 +336,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
  * elements. These elements are linked together by the lock_entry member in
  * struct lock_class.
  */
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
 static LIST_HEAD(free_lock_classes);
 
 /**
@@ -1252,6 +1250,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct hlist_head *hash_head;
 	struct lock_class *class;
+	int idx;
 
 	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
@@ -1317,6 +1316,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 * of classes.
 	 */
 	list_move_tail(&class->lock_entry, &all_lock_classes);
+	idx = class - lock_classes;
+	if (idx > max_lock_class_idx)
+		max_lock_class_idx = idx;
 
 	if (verbose(class)) {
 		graph_unlock();
@@ -6000,6 +6002,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
 		WRITE_ONCE(class->name, NULL);
 		nr_lock_classes--;
 		__clear_bit(class - lock_classes, lock_classes_in_use);
+		if (class - lock_classes == max_lock_class_idx)
+			max_lock_class_idx--;
 	} else {
 		WARN_ONCE(true, "%s() failed for class %s\n", __func__,
 			  class->name);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index ecb8662e7a4e..bbe9000260d0 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
 
 #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
 
-extern struct list_head all_lock_classes;
 extern struct lock_chain lock_chains[];
 
 #define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
@@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks;
 
 extern unsigned int max_lockdep_depth;
 extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
 
 #ifdef CONFIG_PROVE_LOCKING
 extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -205,7 +208,6 @@ struct lockdep_stats {
 };
 
 DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
 
 #define __debug_atomic_inc(ptr)					\
 	this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b8d9a050c337..15fdc7fa5c68 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
 
 #include "lockdep_internals.h"
 
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class)				\
+	for (idx = 0, class = lock_classes; idx <= max_lock_class_idx;	\
+	     idx++, class++)
+
 static void *l_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	return seq_list_next(v, &all_lock_classes, pos);
+	struct lock_class *class = v;
+
+	++class;
+	*pos = class - lock_classes;
+	return (*pos > max_lock_class_idx) ? NULL : class;
 }
 
 static void *l_start(struct seq_file *m, loff_t *pos)
 {
-	return seq_list_start_head(&all_lock_classes, *pos);
+	unsigned long idx = *pos;
+
+	if (idx > max_lock_class_idx)
+		return NULL;
+	return lock_classes + idx;
 }
 
 static void l_stop(struct seq_file *m, void *v)
@@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class)
 
 static int l_show(struct seq_file *m, void *v)
 {
-	struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+	struct lock_class *class = v;
 	struct lock_list *entry;
 	char usage[LOCK_USAGE_CHARS];
+	int idx = class - lock_classes;
 
-	if (v == &all_lock_classes) {
+	if (v == lock_classes)
 		seq_printf(m, "all lock classes:\n");
+
+	if (!test_bit(idx, lock_classes_in_use))
 		return 0;
-	}
 
 	seq_printf(m, "%p", class->key);
 #ifdef CONFIG_DEBUG_LOCKDEP
@@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_PROVE_LOCKING
 	struct lock_class *class;
+	unsigned long idx;
 
-	list_for_each_entry(class, &all_lock_classes, lock_entry) {
+	iterate_lock_classes(idx, class) {
+		if (!test_bit(idx, lock_classes_in_use))
+			continue;
 
 		if (class->usage_mask == 0)
 			nr_unused++;
@@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 
 		sum_forward_deps += lockdep_count_forward_deps(class);
 	}
+
 #ifdef CONFIG_DEBUG_LOCKDEP
 	DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
 #endif
@@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, " max bfs queue depth:           %11u\n",
 			max_bfs_queue_depth);
 #endif
+	seq_printf(m, " max lock class index:          %11lu\n",
+			max_lock_class_idx);
 	lockdep_stats_debug_show(m);
 	seq_printf(m, " debug_locks:                   %11u\n",
 			debug_locks);
@@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
 	if (!res) {
 		struct lock_stat_data *iter = data->stats;
 		struct seq_file *m = file->private_data;
+		unsigned long idx;
 
-		list_for_each_entry(class, &all_lock_classes, lock_entry) {
+		iterate_lock_classes(idx, class) {
+			if (!test_bit(idx, lock_classes_in_use))
+				continue;
 			iter->class = class;
 			iter->stats = lock_stats(class);
 			iter++;
 		}
+
 		data->iter_end = iter;
 
 		sort(data->stats, data->iter_end - data->stats,
@@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
 			       size_t count, loff_t *ppos)
 {
 	struct lock_class *class;
+	unsigned long idx;
 	char c;
 
 	if (count) {
@@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
 		if (c != '0')
 			return count;
 
-		list_for_each_entry(class, &all_lock_classes, lock_entry)
+		iterate_lock_classes(idx, class) {
+			if (!test_bit(idx, lock_classes_in_use))
+				continue;
 			clear_lock_stats(class);
+		}
 	}
 	return count;
 }
-- 
cgit v1.2.3


From adb8fa195efdfaac5852aaac24810b456ce43b04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mauricio=20V=C3=A1squez?= <mauricio@kinvolk.io>
Date: Tue, 15 Feb 2022 17:58:50 -0500
Subject: libbpf: Split bpf_core_apply_relo()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BTFGen needs to run the core relocation logic in order to understand
what are the types involved in a given relocation.

Currently bpf_core_apply_relo() calculates and **applies** a relocation
to an instruction. Having both operations in the same function makes it
difficult to only calculate the relocation without patching the
instruction. This commit splits that logic in two different phases: (1)
calculate the relocation and (2) patch the instruction.

For the first phase bpf_core_apply_relo() is renamed to
bpf_core_calc_relo_insn() who is now only on charge of calculating the
relocation, the second phase uses the already existing
bpf_core_patch_insn(). bpf_object__relocate_core() uses both of them and
the BTFGen will use only bpf_core_calc_relo_insn().

Signed-off-by: Mauricio Vásquez <mauricio@kinvolk.io>
Signed-off-by: Rafael David Tinoco <rafael.tinoco@aquasec.com>
Signed-off-by: Lorenzo Fontana <lorenzo.fontana@elastic.co>
Signed-off-by: Leonardo Di Donato <leonardo.didonato@elastic.co>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220215225856.671072-2-mauricio@kinvolk.io
---
 kernel/bpf/btf.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 11740b300de9..f1d3d2a2f5f6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7225,6 +7225,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
 {
 	bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
 	struct bpf_core_cand_list cands = {};
+	struct bpf_core_relo_res targ_res;
 	struct bpf_core_spec *specs;
 	int err;
 
@@ -7264,13 +7265,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
 		cands.len = cc->cnt;
 		/* cand_cache_mutex needs to span the cache lookup and
 		 * copy of btf pointer into bpf_core_cand_list,
-		 * since module can be unloaded while bpf_core_apply_relo_insn
+		 * since module can be unloaded while bpf_core_calc_relo_insn
 		 * is working with module's btf.
 		 */
 	}
 
-	err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
-				       relo, relo_idx, ctx->btf, &cands, specs);
+	err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+				      &targ_res);
+	if (err)
+		goto out;
+
+	err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+				  &targ_res);
+
 out:
 	kfree(specs);
 	if (need_cands) {
-- 
cgit v1.2.3


From 3f51aa9e296fe4af785d5761bb12556fb2494761 Mon Sep 17 00:00:00 2001
From: Ye Bin <yebin10@huawei.com>
Date: Wed, 9 Feb 2022 19:29:51 +0800
Subject: PM: hibernate: fix load_image_and_restore() error path

As 'swsusp_check' open 'hib_resume_bdev', if call 'create_basic_memory_bitmaps'
failed, we need to close 'hib_resume_bdev' in 'load_image_and_restore' function.

Signed-off-by: Ye Bin <yebin10@huawei.com>
[ rjw: Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e6af502c2fd7..49d1df0218cb 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -689,8 +689,10 @@ static int load_image_and_restore(void)
 
 	lock_device_hotplug();
 	error = create_basic_memory_bitmaps();
-	if (error)
+	if (error) {
+		swsusp_close(FMODE_READ | FMODE_EXCL);
 		goto Unlock;
+	}
 
 	error = swsusp_read(&flags);
 	swsusp_close(FMODE_READ | FMODE_EXCL);
-- 
cgit v1.2.3


From cea86fe246b694a191804b47378eb9d77aefabec Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 14 Feb 2022 18:26:39 -0800
Subject: mm/munlock: rmap call mlock_vma_page() munlock_vma_page()

Add vma argument to mlock_vma_page() and munlock_vma_page(), make them
inline functions which check (vma->vm_flags & VM_LOCKED) before calling
mlock_page() and munlock_page() in mm/mlock.c.

Add bool compound to mlock_vma_page() and munlock_vma_page(): this is
because we have understandable difficulty in accounting pte maps of THPs,
and if passed a PageHead page, mlock_page() and munlock_page() cannot
tell whether it's a pmd map to be counted or a pte map to be ignored.

Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the
others, and use that to call mlock_vma_page() at the end of the page
adds, and munlock_vma_page() at the end of page_remove_rmap() (end or
beginning? unimportant, but end was easier for assertions in testing).

No page lock is required (although almost all adds happen to hold it):
delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s.
Certainly page lock did serialize with page migration, but I'm having
difficulty explaining why that was ever important.

Mlock accounting on THPs has been hard to define, differed between anon
and file, involved PageDoubleMap in some places and not others, required
clear_page_mlock() at some points.  Keep it simple now: just count the
pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks.

page_add_new_anon_rmap() callers unchanged: they have long been calling
lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED
handling (it also checks for not VM_SPECIAL: I think that's overcautious,
and inconsistent with other checks, that mmap_region() already prevents
VM_LOCKED on VM_SPECIAL; but haven't quite convinced myself to change it).

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 kernel/events/uprobes.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6357c3580d07..eed2f7437d96 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 			return err;
 	}
 
-	/* For try_to_free_swap() and munlock_vma_page() below */
+	/* For try_to_free_swap() below */
 	lock_page(old_page);
 
 	mmu_notifier_invalidate_range_start(&range);
@@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		set_pte_at_notify(mm, addr, pvmw.pte,
 				  mk_pte(new_page, vma->vm_page_prot));
 
-	page_remove_rmap(old_page, false);
+	page_remove_rmap(old_page, vma, false);
 	if (!page_mapped(old_page))
 		try_to_free_swap(old_page);
 	page_vma_mapped_walk_done(&pvmw);
-
-	if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
-		munlock_vma_page(old_page);
 	put_page(old_page);
 
 	err = 0;
-- 
cgit v1.2.3


From d24d2a2b0a81dd5e9bb99aeb4559ec9734e1416f Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Thu, 17 Feb 2022 10:30:01 -0800
Subject: bpf: bpf_prog_pack: Set proper size before freeing ro_header

bpf_prog_pack_free() uses header->size to decide whether the header
should be freed with module_memfree() or the bpf_prog_pack logic.
However, in kvmalloc() failure path of bpf_jit_binary_pack_alloc(),
header->size is not set yet. As a result, bpf_prog_pack_free() may treat
a slice of a pack as a standalone kvmalloc'd header and call
module_memfree() on the whole pack. This in turn causes use-after-free by
other users of the pack.

Fix this by setting ro_header->size before freeing ro_header.

Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]")
Reported-by: syzbot+2f649ec6d2eea1495a8f@syzkaller.appspotmail.com
Reported-by: syzbot+ecb1e7e51c52f68f7481@syzkaller.appspotmail.com
Reported-by: syzbot+87f65c75f4a72db05445@syzkaller.appspotmail.com
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220217183001.1876034-1-song@kernel.org
---
 kernel/bpf/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 44623c9b5bb1..ebb0193d07f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1069,6 +1069,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 
 	*rw_header = kvmalloc(size, GFP_KERNEL);
 	if (!*rw_header) {
+		bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
 		bpf_prog_pack_free(ro_header);
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
-- 
cgit v1.2.3


From 5be2226f417d5b06d17e6c52d6e341cf43c29e48 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 15 Feb 2022 13:07:10 -0500
Subject: KVM: x86: allow defining return-0 static calls

A few vendor callbacks are only used by VMX, but they return an integer
or bool value.  Introduce KVM_X86_OP_OPTIONAL_RET0 for them: if a func is
NULL in struct kvm_x86_ops, it will be changed to __static_call_return0
when updating static calls.

Reviewed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 kernel/static_call.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/static_call.c b/kernel/static_call.c
index 43ba0b1e0edb..f2b8baea35d2 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -503,6 +503,7 @@ long __static_call_return0(void)
 {
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__static_call_return0);
 
 #ifdef CONFIG_STATIC_CALL_SELFTEST
 
-- 
cgit v1.2.3


From 9087c6ff8dfe0a070e4e05a434399080603c29de Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Feb 2022 10:18:01 -0800
Subject: bpf: Call maybe_wait_bpf_programs() only once from
 generic_map_delete_batch()

As stated in the comment found in maybe_wait_bpf_programs(),
the synchronize_rcu() barrier is only needed before returning
to userspace, not after each deletion in the batch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20220218181801.2971275-1-eric.dumazet@gmail.com
---
 kernel/bpf/syscall.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a72f63d5a7da..9c7a72b65eee 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1352,7 +1352,6 @@ int generic_map_delete_batch(struct bpf_map *map,
 		err = map->ops->map_delete_elem(map, key);
 		rcu_read_unlock();
 		bpf_enable_instrumentation();
-		maybe_wait_bpf_programs(map);
 		if (err)
 			break;
 		cond_resched();
@@ -1361,6 +1360,8 @@ int generic_map_delete_batch(struct bpf_map *map,
 		err = -EFAULT;
 
 	kvfree(key);
+
+	maybe_wait_bpf_programs(map);
 	return err;
 }
 
-- 
cgit v1.2.3


From 4c7485584d48f60b1e742c7c6a3a1fa503d48d97 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 14 Feb 2022 16:52:10 +0000
Subject: sched/preempt: Move PREEMPT_DYNAMIC logic later

The PREEMPT_DYNAMIC logic in kernel/sched/core.c patches static calls
for a bunch of preemption functions. While most are defined prior to
this, the definition of cond_resched() is later in the file, and so we
only have its declarations from include/linux/sched.h.

In subsequent patches we'd like to define some macros alongside the
definition of each of the preemption functions, which we can use within
sched_dynamic_update(). For this to be possible, the PREEMPT_DYNAMIC
logic needs to be placed after the various preemption functions.

As a preparatory step, this patch moves the PREEMPT_DYNAMIC logic after
the various preemption functions, with no other changes -- this is
purely a move.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220214165216.2231574-2-mark.rutland@arm.com
---
 kernel/sched/core.c | 272 ++++++++++++++++++++++++++--------------------------
 1 file changed, 136 insertions(+), 136 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e08b02e0cd5..a123ffa8e21c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6555,142 +6555,6 @@ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
 
 #endif /* CONFIG_PREEMPTION */
 
-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-#include <linux/entry-common.h>
-
-/*
- * SC:cond_resched
- * SC:might_resched
- * SC:preempt_schedule
- * SC:preempt_schedule_notrace
- * SC:irqentry_exit_cond_resched
- *
- *
- * NONE:
- *   cond_resched               <- __cond_resched
- *   might_resched              <- RET0
- *   preempt_schedule           <- NOP
- *   preempt_schedule_notrace   <- NOP
- *   irqentry_exit_cond_resched <- NOP
- *
- * VOLUNTARY:
- *   cond_resched               <- __cond_resched
- *   might_resched              <- __cond_resched
- *   preempt_schedule           <- NOP
- *   preempt_schedule_notrace   <- NOP
- *   irqentry_exit_cond_resched <- NOP
- *
- * FULL:
- *   cond_resched               <- RET0
- *   might_resched              <- RET0
- *   preempt_schedule           <- preempt_schedule
- *   preempt_schedule_notrace   <- preempt_schedule_notrace
- *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
- */
-
-enum {
-	preempt_dynamic_undefined = -1,
-	preempt_dynamic_none,
-	preempt_dynamic_voluntary,
-	preempt_dynamic_full,
-};
-
-int preempt_dynamic_mode = preempt_dynamic_undefined;
-
-int sched_dynamic_mode(const char *str)
-{
-	if (!strcmp(str, "none"))
-		return preempt_dynamic_none;
-
-	if (!strcmp(str, "voluntary"))
-		return preempt_dynamic_voluntary;
-
-	if (!strcmp(str, "full"))
-		return preempt_dynamic_full;
-
-	return -EINVAL;
-}
-
-void sched_dynamic_update(int mode)
-{
-	/*
-	 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
-	 * the ZERO state, which is invalid.
-	 */
-	static_call_update(cond_resched, __cond_resched);
-	static_call_update(might_resched, __cond_resched);
-	static_call_update(preempt_schedule, __preempt_schedule_func);
-	static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-	static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-
-	switch (mode) {
-	case preempt_dynamic_none:
-		static_call_update(cond_resched, __cond_resched);
-		static_call_update(might_resched, (void *)&__static_call_return0);
-		static_call_update(preempt_schedule, NULL);
-		static_call_update(preempt_schedule_notrace, NULL);
-		static_call_update(irqentry_exit_cond_resched, NULL);
-		pr_info("Dynamic Preempt: none\n");
-		break;
-
-	case preempt_dynamic_voluntary:
-		static_call_update(cond_resched, __cond_resched);
-		static_call_update(might_resched, __cond_resched);
-		static_call_update(preempt_schedule, NULL);
-		static_call_update(preempt_schedule_notrace, NULL);
-		static_call_update(irqentry_exit_cond_resched, NULL);
-		pr_info("Dynamic Preempt: voluntary\n");
-		break;
-
-	case preempt_dynamic_full:
-		static_call_update(cond_resched, (void *)&__static_call_return0);
-		static_call_update(might_resched, (void *)&__static_call_return0);
-		static_call_update(preempt_schedule, __preempt_schedule_func);
-		static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-		static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-		pr_info("Dynamic Preempt: full\n");
-		break;
-	}
-
-	preempt_dynamic_mode = mode;
-}
-
-static int __init setup_preempt_mode(char *str)
-{
-	int mode = sched_dynamic_mode(str);
-	if (mode < 0) {
-		pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
-		return 0;
-	}
-
-	sched_dynamic_update(mode);
-	return 1;
-}
-__setup("preempt=", setup_preempt_mode);
-
-static void __init preempt_dynamic_init(void)
-{
-	if (preempt_dynamic_mode == preempt_dynamic_undefined) {
-		if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
-			sched_dynamic_update(preempt_dynamic_none);
-		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
-			sched_dynamic_update(preempt_dynamic_voluntary);
-		} else {
-			/* Default static call setting, nothing to do */
-			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
-			preempt_dynamic_mode = preempt_dynamic_full;
-			pr_info("Dynamic Preempt: full\n");
-		}
-	}
-}
-
-#else /* !CONFIG_PREEMPT_DYNAMIC */
-
-static inline void preempt_dynamic_init(void) { }
-
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
-
 /*
  * This is the entry point to schedule() from kernel preemption
  * off of irq context.
@@ -8271,6 +8135,142 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_rwlock_write);
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#include <linux/entry-common.h>
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ *   cond_resched               <- __cond_resched
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- NOP
+ *   preempt_schedule_notrace   <- NOP
+ *   irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ *   cond_resched               <- __cond_resched
+ *   might_resched              <- __cond_resched
+ *   preempt_schedule           <- NOP
+ *   preempt_schedule_notrace   <- NOP
+ *   irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ *   cond_resched               <- RET0
+ *   might_resched              <- RET0
+ *   preempt_schedule           <- preempt_schedule
+ *   preempt_schedule_notrace   <- preempt_schedule_notrace
+ *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+	preempt_dynamic_undefined = -1,
+	preempt_dynamic_none,
+	preempt_dynamic_voluntary,
+	preempt_dynamic_full,
+};
+
+int preempt_dynamic_mode = preempt_dynamic_undefined;
+
+int sched_dynamic_mode(const char *str)
+{
+	if (!strcmp(str, "none"))
+		return preempt_dynamic_none;
+
+	if (!strcmp(str, "voluntary"))
+		return preempt_dynamic_voluntary;
+
+	if (!strcmp(str, "full"))
+		return preempt_dynamic_full;
+
+	return -EINVAL;
+}
+
+void sched_dynamic_update(int mode)
+{
+	/*
+	 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+	 * the ZERO state, which is invalid.
+	 */
+	static_call_update(cond_resched, __cond_resched);
+	static_call_update(might_resched, __cond_resched);
+	static_call_update(preempt_schedule, __preempt_schedule_func);
+	static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+	static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+
+	switch (mode) {
+	case preempt_dynamic_none:
+		static_call_update(cond_resched, __cond_resched);
+		static_call_update(might_resched, (void *)&__static_call_return0);
+		static_call_update(preempt_schedule, NULL);
+		static_call_update(preempt_schedule_notrace, NULL);
+		static_call_update(irqentry_exit_cond_resched, NULL);
+		pr_info("Dynamic Preempt: none\n");
+		break;
+
+	case preempt_dynamic_voluntary:
+		static_call_update(cond_resched, __cond_resched);
+		static_call_update(might_resched, __cond_resched);
+		static_call_update(preempt_schedule, NULL);
+		static_call_update(preempt_schedule_notrace, NULL);
+		static_call_update(irqentry_exit_cond_resched, NULL);
+		pr_info("Dynamic Preempt: voluntary\n");
+		break;
+
+	case preempt_dynamic_full:
+		static_call_update(cond_resched, (void *)&__static_call_return0);
+		static_call_update(might_resched, (void *)&__static_call_return0);
+		static_call_update(preempt_schedule, __preempt_schedule_func);
+		static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+		static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+		pr_info("Dynamic Preempt: full\n");
+		break;
+	}
+
+	preempt_dynamic_mode = mode;
+}
+
+static int __init setup_preempt_mode(char *str)
+{
+	int mode = sched_dynamic_mode(str);
+	if (mode < 0) {
+		pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+		return 0;
+	}
+
+	sched_dynamic_update(mode);
+	return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+static void __init preempt_dynamic_init(void)
+{
+	if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+		if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+			sched_dynamic_update(preempt_dynamic_none);
+		} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+			sched_dynamic_update(preempt_dynamic_voluntary);
+		} else {
+			/* Default static call setting, nothing to do */
+			WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+			preempt_dynamic_mode = preempt_dynamic_full;
+			pr_info("Dynamic Preempt: full\n");
+		}
+	}
+}
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+
 /**
  * yield - yield the current processor to other threads.
  *
-- 
cgit v1.2.3


From 8a69fe0be143b0a1af829f85f0e9a1ae7d6a04db Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 14 Feb 2022 16:52:11 +0000
Subject: sched/preempt: Refactor sched_dynamic_update()

Currently sched_dynamic_update needs to open-code the enabled/disabled
function names for each preemption model it supports, when in practice
this is a boolean enabled/disabled state for each function.

Make this clearer and avoid repetition by defining the enabled/disabled
states at the function definition, and using helper macros to perform the
static_call_update(). Where x86 currently overrides the enabled
function, it is made to provide both the enabled and disabled states for
consistency, with defaults provided by the core code otherwise.

In subsequent patches this will allow us to support PREEMPT_DYNAMIC
without static calls.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220214165216.2231574-3-mark.rutland@arm.com
---
 kernel/sched/core.c | 59 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a123ffa8e21c..bf3a97f48c1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6491,7 +6491,11 @@ NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+#ifndef preempt_schedule_dynamic_enabled
+#define preempt_schedule_dynamic_enabled	preempt_schedule
+#define preempt_schedule_dynamic_disabled	NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
 EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
 #endif
 
@@ -6549,7 +6553,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+#ifndef preempt_schedule_notrace_dynamic_enabled
+#define preempt_schedule_notrace_dynamic_enabled	preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled	NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
 EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
 #endif
 
@@ -8060,9 +8068,13 @@ EXPORT_SYMBOL(__cond_resched);
 #endif
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
+#define cond_resched_dynamic_enabled	__cond_resched
+#define cond_resched_dynamic_disabled	((void *)&__static_call_return0)
 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
 EXPORT_STATIC_CALL_TRAMP(cond_resched);
 
+#define might_resched_dynamic_enabled	__cond_resched
+#define might_resched_dynamic_disabled	((void *)&__static_call_return0)
 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
 EXPORT_STATIC_CALL_TRAMP(might_resched);
 #endif
@@ -8192,43 +8204,46 @@ int sched_dynamic_mode(const char *str)
 	return -EINVAL;
 }
 
+#define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
+
 void sched_dynamic_update(int mode)
 {
 	/*
 	 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
 	 * the ZERO state, which is invalid.
 	 */
-	static_call_update(cond_resched, __cond_resched);
-	static_call_update(might_resched, __cond_resched);
-	static_call_update(preempt_schedule, __preempt_schedule_func);
-	static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-	static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+	preempt_dynamic_enable(cond_resched);
+	preempt_dynamic_enable(might_resched);
+	preempt_dynamic_enable(preempt_schedule);
+	preempt_dynamic_enable(preempt_schedule_notrace);
+	preempt_dynamic_enable(irqentry_exit_cond_resched);
 
 	switch (mode) {
 	case preempt_dynamic_none:
-		static_call_update(cond_resched, __cond_resched);
-		static_call_update(might_resched, (void *)&__static_call_return0);
-		static_call_update(preempt_schedule, NULL);
-		static_call_update(preempt_schedule_notrace, NULL);
-		static_call_update(irqentry_exit_cond_resched, NULL);
+		preempt_dynamic_enable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_disable(preempt_schedule);
+		preempt_dynamic_disable(preempt_schedule_notrace);
+		preempt_dynamic_disable(irqentry_exit_cond_resched);
 		pr_info("Dynamic Preempt: none\n");
 		break;
 
 	case preempt_dynamic_voluntary:
-		static_call_update(cond_resched, __cond_resched);
-		static_call_update(might_resched, __cond_resched);
-		static_call_update(preempt_schedule, NULL);
-		static_call_update(preempt_schedule_notrace, NULL);
-		static_call_update(irqentry_exit_cond_resched, NULL);
+		preempt_dynamic_enable(cond_resched);
+		preempt_dynamic_enable(might_resched);
+		preempt_dynamic_disable(preempt_schedule);
+		preempt_dynamic_disable(preempt_schedule_notrace);
+		preempt_dynamic_disable(irqentry_exit_cond_resched);
 		pr_info("Dynamic Preempt: voluntary\n");
 		break;
 
 	case preempt_dynamic_full:
-		static_call_update(cond_resched, (void *)&__static_call_return0);
-		static_call_update(might_resched, (void *)&__static_call_return0);
-		static_call_update(preempt_schedule, __preempt_schedule_func);
-		static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-		static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+		preempt_dynamic_disable(cond_resched);
+		preempt_dynamic_disable(might_resched);
+		preempt_dynamic_enable(preempt_schedule);
+		preempt_dynamic_enable(preempt_schedule_notrace);
+		preempt_dynamic_enable(irqentry_exit_cond_resched);
 		pr_info("Dynamic Preempt: full\n");
 		break;
 	}
-- 
cgit v1.2.3


From 4624a14f4daa8ab4578d274555fd8847254ce339 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 14 Feb 2022 16:52:12 +0000
Subject: sched/preempt: Simplify irqentry_exit_cond_resched() callers

Currently callers of irqentry_exit_cond_resched() need to be aware of
whether the function should be indirected via a static call, leading to
ugly ifdeffery in callers.

Save them the hassle with a static inline wrapper that does the right
thing. The raw_irqentry_exit_cond_resched() will also be useful in
subsequent patches which will add conditional wrappers for preemption
functions.

Note: in arch/x86/entry/common.c, xen_pv_evtchn_do_upcall() always calls
irqentry_exit_cond_resched() directly, even when PREEMPT_DYNAMIC is in
use. I believe this is a latent bug (which this patch corrects), but I'm
not entirely certain this wasn't deliberate.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220214165216.2231574-4-mark.rutland@arm.com
---
 kernel/entry/common.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..1739ca79613b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -380,7 +380,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 	return ret;
 }
 
-void irqentry_exit_cond_resched(void)
+void raw_irqentry_exit_cond_resched(void)
 {
 	if (!preempt_count()) {
 		/* Sanity check RCU and thread stack */
@@ -392,7 +392,7 @@ void irqentry_exit_cond_resched(void)
 	}
 }
 #ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
 #endif
 
 noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
@@ -420,13 +420,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 		}
 
 		instrumentation_begin();
-		if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMPT_DYNAMIC
-			static_call(irqentry_exit_cond_resched)();
-#else
+		if (IS_ENABLED(CONFIG_PREEMPTION))
 			irqentry_exit_cond_resched();
-#endif
-		}
+
 		/* Covers both tracing and lockdep */
 		trace_hardirqs_on();
 		instrumentation_end();
-- 
cgit v1.2.3


From 33c64734be3461222a8aa27d3dadc477ebca62de Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 14 Feb 2022 16:52:13 +0000
Subject: sched/preempt: Decouple HAVE_PREEMPT_DYNAMIC from GENERIC_ENTRY

Now that the enabled/disabled states for the preemption functions are
declared alongside their definitions, the core PREEMPT_DYNAMIC logic is
no longer tied to GENERIC_ENTRY, and can safely be selected so long as
an architecture provides enabled/disabled states for
irqentry_exit_cond_resched().

Make it possible to select HAVE_PREEMPT_DYNAMIC without GENERIC_ENTRY.

For existing users of HAVE_PREEMPT_DYNAMIC there should be no functional
change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220214165216.2231574-5-mark.rutland@arm.com
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bf3a97f48c1d..300c0454a2b8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8149,7 +8149,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
 
+#ifdef CONFIG_GENERIC_ENTRY
 #include <linux/entry-common.h>
+#endif
 
 /*
  * SC:cond_resched
-- 
cgit v1.2.3


From 99cf983cc8bca4adb461b519664c939a565cfd4d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 14 Feb 2022 16:52:14 +0000
Subject: sched/preempt: Add PREEMPT_DYNAMIC using static keys

Where an architecture selects HAVE_STATIC_CALL but not
HAVE_STATIC_CALL_INLINE, each static call has an out-of-line trampoline
which will either branch to a callee or return to the caller.

On such architectures, a number of constraints can conspire to make
those trampolines more complicated and potentially less useful than we'd
like. For example:

* Hardware and software control flow integrity schemes can require the
  addition of "landing pad" instructions (e.g. `BTI` for arm64), which
  will also be present at the "real" callee.

* Limited branch ranges can require that trampolines generate or load an
  address into a register and perform an indirect branch (or at least
  have a slow path that does so). This loses some of the benefits of
  having a direct branch.

* Interaction with SW CFI schemes can be complicated and fragile, e.g.
  requiring that we can recognise idiomatic codegen and remove
  indirections understand, at least until clang proves more helpful
  mechanisms for dealing with this.

For PREEMPT_DYNAMIC, we don't need the full power of static calls, as we
really only need to enable/disable specific preemption functions. We can
achieve the same effect without a number of the pain points above by
using static keys to fold early returns into the preemption functions
themselves rather than in an out-of-line trampoline, effectively
inlining the trampoline into the start of the function.

For arm64, this results in good code generation. For example, the
dynamic_cond_resched() wrapper looks as follows when enabled. When
disabled, the first `B` is replaced with a `NOP`, resulting in an early
return.

| <dynamic_cond_resched>:
|        bti     c
|        b       <dynamic_cond_resched+0x10>     // or `nop`
|        mov     w0, #0x0
|        ret
|        mrs     x0, sp_el0
|        ldr     x0, [x0, #8]
|        cbnz    x0, <dynamic_cond_resched+0x8>
|        paciasp
|        stp     x29, x30, [sp, #-16]!
|        mov     x29, sp
|        bl      <preempt_schedule_common>
|        mov     w0, #0x1
|        ldp     x29, x30, [sp], #16
|        autiasp
|        ret

... compared to the regular form of the function:

| <__cond_resched>:
|        bti     c
|        mrs     x0, sp_el0
|        ldr     x1, [x0, #8]
|        cbz     x1, <__cond_resched+0x18>
|        mov     w0, #0x0
|        ret
|        paciasp
|        stp     x29, x30, [sp, #-16]!
|        mov     x29, sp
|        bl      <preempt_schedule_common>
|        mov     w0, #0x1
|        ldp     x29, x30, [sp], #16
|        autiasp
|        ret

Any architecture which implements static keys should be able to use this
to implement PREEMPT_DYNAMIC with similar cost to non-inlined static
calls. Since this is likely to have greater overhead than (inlined)
static calls, PREEMPT_DYNAMIC is only defaulted to enabled when
HAVE_PREEMPT_DYNAMIC_CALL is selected.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220214165216.2231574-6-mark.rutland@arm.com
---
 kernel/Kconfig.preempt |  3 ++-
 kernel/entry/common.c  | 11 ++++++++++
 kernel/sched/core.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 65 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index ce77f0265660..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -96,8 +96,9 @@ config PREEMPTION
 config PREEMPT_DYNAMIC
 	bool "Preemption behaviour defined on boot"
 	depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+	select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
 	select PREEMPT_BUILD
-	default y
+	default y if HAVE_PREEMPT_DYNAMIC_CALL
 	help
 	  This option allows to define the preemption model on the kernel
 	  command line parameter and thus override the default preemption
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 1739ca79613b..b145249ad91a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -3,6 +3,7 @@
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
 #include <linux/highmem.h>
+#include <linux/jump_label.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
 #include <linux/tick.h>
@@ -392,7 +393,17 @@ void raw_irqentry_exit_cond_resched(void)
 	}
 }
 #ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+	if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+		return;
+	raw_irqentry_exit_cond_resched();
+}
+#endif
 #endif
 
 noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 300c0454a2b8..9e65028189f4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -14,6 +14,7 @@
 
 #include <linux/nospec.h>
 #include <linux/blkdev.h>
+#include <linux/jump_label.h>
 #include <linux/kcov.h>
 #include <linux/scs.h>
 
@@ -6484,21 +6485,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	 */
 	if (likely(!preemptible()))
 		return;
-
 	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #ifndef preempt_schedule_dynamic_enabled
 #define preempt_schedule_dynamic_enabled	preempt_schedule
 #define preempt_schedule_dynamic_disabled	NULL
 #endif
 DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
 EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+	if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+		return;
+	preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif
 #endif
-
 
 /**
  * preempt_schedule_notrace - preempt_schedule called by tracing
@@ -6553,12 +6564,24 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #ifndef preempt_schedule_notrace_dynamic_enabled
 #define preempt_schedule_notrace_dynamic_enabled	preempt_schedule_notrace
 #define preempt_schedule_notrace_dynamic_disabled	NULL
 #endif
 DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
 EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
+{
+	if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+		return;
+	preempt_schedule_notrace();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif
 #endif
 
 #endif /* CONFIG_PREEMPTION */
@@ -8068,6 +8091,7 @@ EXPORT_SYMBOL(__cond_resched);
 #endif
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define cond_resched_dynamic_enabled	__cond_resched
 #define cond_resched_dynamic_disabled	((void *)&__static_call_return0)
 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
@@ -8077,6 +8101,25 @@ EXPORT_STATIC_CALL_TRAMP(cond_resched);
 #define might_resched_dynamic_disabled	((void *)&__static_call_return0)
 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
 EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+	if (!static_branch_unlikely(&sk_dynamic_cond_resched))
+		return 0;
+	return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_cond_resched);
+
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
+{
+	if (!static_branch_unlikely(&sk_dynamic_might_resched))
+		return 0;
+	return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_might_resched);
+#endif
 #endif
 
 /*
@@ -8206,8 +8249,15 @@ int sched_dynamic_mode(const char *str)
 	return -EINVAL;
 }
 
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 #define preempt_dynamic_enable(f)	static_call_update(f, f##_dynamic_enabled)
 #define preempt_dynamic_disable(f)	static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f)	static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_disable(f)	static_key_disable(&sk_dynamic_##f.key)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
 
 void sched_dynamic_update(int mode)
 {
-- 
cgit v1.2.3


From d0b3822902b6af45f2c75706d7eb2a35aacab223 Mon Sep 17 00:00:00 2001
From: "Souptick Joarder (HPE)" <jrdr.linux@gmail.com>
Date: Sat, 19 Feb 2022 22:09:15 +0530
Subject: bpf: Initialize ret to 0 inside btf_populate_kfunc_set()

Kernel test robot reported below error ->

kernel/bpf/btf.c:6718 btf_populate_kfunc_set()
error: uninitialized symbol 'ret'.

Initialize ret to 0.

Fixes: dee872e124e8 ("bpf: Populate kfunc BTF ID sets in struct btf")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Souptick Joarder (HPE) <jrdr.linux@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20220219163915.125770-1-jrdr.linux@gmail.com
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 02d7014417a0..2c4c5dbe2abe 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6706,7 +6706,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
 				  const struct btf_kfunc_id_set *kset)
 {
 	bool vmlinux_set = !btf_is_module(btf);
-	int type, ret;
+	int type, ret = 0;
 
 	for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
 		if (!kset->sets[type])
-- 
cgit v1.2.3


From 509853f9e1e7b1490dc79f735a5dbafc9298f40d Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 11 Feb 2022 19:14:54 +0100
Subject: genirq: Provide generic_handle_irq_safe()

Provide generic_handle_irq_safe() which can used from any context.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reviewed-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Link: https://lore.kernel.org/r/20220211181500.1856198-2-bigeasy@linutronix.de
---
 kernel/irq/irqdesc.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2267e6527db3..346d283d2da1 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq)
 }
 EXPORT_SYMBOL_GPL(generic_handle_irq);
 
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ *			     context.
+ * @irq:	The irq number to handle
+ *
+ * Returns:	0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = handle_irq_desc(irq_to_desc(irq));
+	local_irq_restore(flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
+
 #ifdef CONFIG_IRQ_DOMAIN
 /**
  * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
-- 
cgit v1.2.3


From a5a763b2b26678f1e01b2d031819b175d8f14555 Mon Sep 17 00:00:00 2001
From: Andre Kalb <andre.kalb@sma.de>
Date: Wed, 16 Feb 2022 11:41:38 +0100
Subject: printk: Set console_set_on_cmdline=1 when __add_preferred_console()
 is called with user_specified == true

In case of using console="" or console=null
set console_set_on_cmdline=1 to disable "stdout-path" node from DT.

We basically need to set it every time when __add_preferred_console()
is called with parameter 'user_specified' set.
Therefore we can move setting it into a helper function that is
called from __add_preferred_console().

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Andre Kalb <andre.kalb@sma.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/YgzU4ho8l6XapyG2@pc6682
---
 kernel/printk/printk.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 25dce8b74791..266cc974b0e3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2323,6 +2323,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
 }
 #endif
 
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+	if (!user_specified)
+		return;
+
+	/*
+	 * @c console was defined by the user on the command line.
+	 * Do not clear when added twice also by SPCR or the device tree.
+	 */
+	c->user_specified = true;
+	/* At least one console defined by the user on the command line. */
+	console_set_on_cmdline = 1;
+}
+
 static int __add_preferred_console(char *name, int idx, char *options,
 				   char *brl_options, bool user_specified)
 {
@@ -2339,8 +2353,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
 		if (strcmp(c->name, name) == 0 && c->index == idx) {
 			if (!brl_options)
 				preferred_console = i;
-			if (user_specified)
-				c->user_specified = true;
+			set_user_specified(c, user_specified);
 			return 0;
 		}
 	}
@@ -2350,7 +2363,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
 		preferred_console = i;
 	strlcpy(c->name, name, sizeof(c->name));
 	c->options = options;
-	c->user_specified = user_specified;
+	set_user_specified(c, user_specified);
 	braille_set_options(c, brl_options);
 
 	c->index = idx;
@@ -2416,7 +2429,6 @@ static int __init console_setup(char *str)
 	*s = 0;
 
 	__add_preferred_console(buf, idx, options, brl_options, true);
-	console_set_on_cmdline = 1;
 	return 1;
 }
 __setup("console=", console_setup);
-- 
cgit v1.2.3


From ce06e863f36f16cdc3b84c7206cd13d5f597d623 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Wed, 16 Feb 2022 11:19:57 +0800
Subject: printk: make suppress_panic_printk static

This symbol is not used outside of printk.c, so marks it static.

Fix the following sparse warning:

kernel/printk/printk.c:100:19: warning: symbol 'suppress_panic_printk'
was not declared. Should it be static?

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220216031957.9761-1-jiapeng.chong@linux.alibaba.com
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6a51907a33b9..f9430ac4caca 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -97,7 +97,7 @@ int __read_mostly suppress_printk;
  * During panic, heavy printk by other CPUs can delay the
  * panic and risk deadlock on console resources.
  */
-int __read_mostly suppress_panic_printk;
+static int __read_mostly suppress_panic_printk;
 
 #ifdef CONFIG_LOCKDEP
 static struct lockdep_map console_lock_dep_map = {
-- 
cgit v1.2.3


From 3191dd5a1179ef0fad5a050a1702ae98b6251e8f Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 13 Feb 2022 22:48:04 +0100
Subject: random: clear fast pool, crng, and batches in cpuhp bring up

For the irq randomness fast pool, rather than having to use expensive
atomics, which were visibly the most expensive thing in the entire irq
handler, simply take care of the extreme edge case of resetting count to
zero in the cpuhp online handler, just after workqueues have been
reenabled. This simplifies the code a bit and lets us use vanilla
variables rather than atomics, and performance should be improved.

As well, very early on when the CPU comes up, while interrupts are still
disabled, we clear out the per-cpu crng and its batches, so that it
always starts with fresh randomness.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Sultan Alsawaf <sultan@kerneltoast.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 kernel/cpu.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35e..238cba15449f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,6 +34,7 @@
 #include <linux/scs.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/cpuset.h>
+#include <linux/random.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -1659,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= perf_event_init_cpu,
 		.teardown.single	= perf_event_exit_cpu,
 	},
+	[CPUHP_RANDOM_PREPARE] = {
+		.name			= "random:prepare",
+		.startup.single		= random_prepare_cpu,
+		.teardown.single	= NULL,
+	},
 	[CPUHP_WORKQUEUE_PREP] = {
 		.name			= "workqueue:prepare",
 		.startup.single		= workqueue_prepare_cpu,
@@ -1782,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= workqueue_online_cpu,
 		.teardown.single	= workqueue_offline_cpu,
 	},
+	[CPUHP_AP_RANDOM_ONLINE] = {
+		.name			= "random:online",
+		.startup.single		= random_online_cpu,
+		.teardown.single	= NULL,
+	},
 	[CPUHP_AP_RCUTREE_ONLINE] = {
 		.name			= "RCU/tree:online",
 		.startup.single		= rcutree_online_cpu,
-- 
cgit v1.2.3


From 272ceeaea355214b301530e262a0df8600bfca95 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 22 Feb 2022 11:44:51 -0500
Subject: audit: log AUDIT_TIME_* records only from rules

AUDIT_TIME_* events are generated when there are syscall rules present
that are not related to time keeping.  This will produce noisy log
entries that could flood the logs and hide events we really care about.

Rather than immediately produce the AUDIT_TIME_* records, store the data
in the context and log it at syscall exit time respecting the filter
rules.

Note: This eats the audit_buffer, unlike any others in show_special().

Please see https://bugzilla.redhat.com/show_bug.cgi?id=1991919

Fixes: 7e8eda734d30 ("ntp: Audit NTP parameters adjustment")
Fixes: 2d87a0674bd6 ("timekeeping: Audit clock adjustments")
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: fixed style/whitespace issues]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.h   |  4 +++
 kernel/auditsc.c | 87 +++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 71 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index c4498090a5bd..58b66543b4d5 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -201,6 +201,10 @@ struct audit_context {
 		struct {
 			char			*name;
 		} module;
+		struct {
+			struct audit_ntp_data	ntp_data;
+			struct timespec64	tk_injoffset;
+		} time;
 	};
 	int fds[2];
 	struct audit_proctitle proctitle;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fce5d43a933f..0efd75e4730f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1340,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 			 from_kuid(&init_user_ns, name->fcap.rootid));
 }
 
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+	const struct audit_ntp_data *ntp = &context->time.ntp_data;
+	const struct timespec64 *tk = &context->time.tk_injoffset;
+	static const char * const ntp_name[] = {
+		"offset",
+		"freq",
+		"status",
+		"tai",
+		"tick",
+		"adjust",
+	};
+	int type;
+
+	if (context->type == AUDIT_TIME_ADJNTPVAL) {
+		for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+			if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+				if (!*ab) {
+					*ab = audit_log_start(context,
+							GFP_KERNEL,
+							AUDIT_TIME_ADJNTPVAL);
+					if (!*ab)
+						return;
+				}
+				audit_log_format(*ab, "op=%s old=%lli new=%lli",
+						 ntp_name[type],
+						 ntp->vals[type].oldval,
+						 ntp->vals[type].newval);
+				audit_log_end(*ab);
+				*ab = NULL;
+			}
+		}
+	}
+	if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+		if (!*ab) {
+			*ab = audit_log_start(context, GFP_KERNEL,
+					      AUDIT_TIME_INJOFFSET);
+			if (!*ab)
+				return;
+		}
+		audit_log_format(*ab, "sec=%lli nsec=%li",
+				 (long long)tk->tv_sec, tk->tv_nsec);
+		audit_log_end(*ab);
+		*ab = NULL;
+	}
+}
+
 static void show_special(struct audit_context *context, int *call_panic)
 {
 	struct audit_buffer *ab;
@@ -1454,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic)
 			audit_log_format(ab, "(null)");
 
 		break;
+	case AUDIT_TIME_ADJNTPVAL:
+	case AUDIT_TIME_INJOFFSET:
+		/* this call deviates from the rest, eating the buffer */
+		audit_log_time(context, &ab);
+		break;
 	}
 	audit_log_end(ab);
 }
@@ -2849,31 +2901,26 @@ void __audit_fanotify(unsigned int response)
 
 void __audit_tk_injoffset(struct timespec64 offset)
 {
-	audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
-		  "sec=%lli nsec=%li",
-		  (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
-			      const char *op, enum audit_ntp_type type)
-{
-	const struct audit_ntp_val *val = &ad->vals[type];
-
-	if (val->newval == val->oldval)
-		return;
+	struct audit_context *context = audit_context();
 
-	audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
-		  "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+	/* only set type if not already set by NTP */
+	if (!context->type)
+		context->type = AUDIT_TIME_INJOFFSET;
+	memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
 }
 
 void __audit_ntp_log(const struct audit_ntp_data *ad)
 {
-	audit_log_ntp_val(ad, "offset",	AUDIT_NTP_OFFSET);
-	audit_log_ntp_val(ad, "freq",	AUDIT_NTP_FREQ);
-	audit_log_ntp_val(ad, "status",	AUDIT_NTP_STATUS);
-	audit_log_ntp_val(ad, "tai",	AUDIT_NTP_TAI);
-	audit_log_ntp_val(ad, "tick",	AUDIT_NTP_TICK);
-	audit_log_ntp_val(ad, "adjust",	AUDIT_NTP_ADJUST);
+	struct audit_context *context = audit_context();
+	int type;
+
+	for (type = 0; type < AUDIT_NTP_NVALS; type++)
+		if (ad->vals[type].newval != ad->vals[type].oldval) {
+			/* unconditionally set type, overwriting TK */
+			context->type = AUDIT_TIME_ADJNTPVAL;
+			memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+			break;
+		}
 }
 
 void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
-- 
cgit v1.2.3


From be9a2277cafd318976d59c41a7f45a934ec43b26 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:23:59 +0100
Subject: fork: Redo ifdefs around task stack handling

The use of ifdef CONFIG_VMAP_STACK is confusing in terms what is
actually happenning and what can happen.

For instance from reading free_thread_stack() it appears that in the
CONFIG_VMAP_STACK case it may receive a non-NULL vm pointer but it may also
be NULL in which case __free_pages() is used to free the stack.  This is
however not the case because in the VMAP case a non-NULL pointer is always
returned here.  Since it looks like this might happen, the compiler creates
the correct dead code with the invocation to __free_pages() and everything
around it. Twice.

Add spaces between the ifdef and the identifer to recognize the ifdef
level which is currently in scope.

Add the current identifer as a comment behind #else and #endif.
Move the code within free_thread_stack() and alloc_thread_stack_node()
into the relevant ifdef blocks.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-2-bigeasy@linutronix.de
---
 kernel/fork.c | 74 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a024bf6254df..f5cc10164334 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -185,7 +185,7 @@ static inline void free_task_struct(struct task_struct *tsk)
  */
 # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
 
-#ifdef CONFIG_VMAP_STACK
+#  ifdef CONFIG_VMAP_STACK
 /*
  * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
  * flush.  Try to minimize the number of calls by caching stacks.
@@ -210,11 +210,9 @@ static int free_vm_stack_cache(unsigned int cpu)
 
 	return 0;
 }
-#endif
 
 static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
-#ifdef CONFIG_VMAP_STACK
 	void *stack;
 	int i;
 
@@ -258,45 +256,53 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		tsk->stack = stack;
 	}
 	return stack;
-#else
-	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
-					     THREAD_SIZE_ORDER);
-
-	if (likely(page)) {
-		tsk->stack = kasan_reset_tag(page_address(page));
-		return tsk->stack;
-	}
-	return NULL;
-#endif
 }
 
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
 {
-#ifdef CONFIG_VMAP_STACK
 	struct vm_struct *vm = task_stack_vm_area(tsk);
+	int i;
 
-	if (vm) {
-		int i;
+	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+		memcg_kmem_uncharge_page(vm->pages[i], 0);
 
-		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
-			memcg_kmem_uncharge_page(vm->pages[i], 0);
+	for (i = 0; i < NR_CACHED_STACKS; i++) {
+		if (this_cpu_cmpxchg(cached_stacks[i], NULL,
+				     tsk->stack_vm_area) != NULL)
+			continue;
 
-		for (i = 0; i < NR_CACHED_STACKS; i++) {
-			if (this_cpu_cmpxchg(cached_stacks[i],
-					NULL, tsk->stack_vm_area) != NULL)
-				continue;
+		tsk->stack = NULL;
+		tsk->stack_vm_area = NULL;
+		return;
+	}
+	vfree_atomic(tsk->stack);
+	tsk->stack = NULL;
+	tsk->stack_vm_area = NULL;
+}
 
-			return;
-		}
+#  else /* !CONFIG_VMAP_STACK */
 
-		vfree_atomic(tsk->stack);
-		return;
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+					     THREAD_SIZE_ORDER);
+
+	if (likely(page)) {
+		tsk->stack = kasan_reset_tag(page_address(page));
+		return tsk->stack;
 	}
-#endif
+	return NULL;
+}
 
+static void free_thread_stack(struct task_struct *tsk)
+{
 	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+	tsk->stack = NULL;
 }
-# else
+
+#  endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+
 static struct kmem_cache *thread_stack_cache;
 
 static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
@@ -312,6 +318,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 static void free_thread_stack(struct task_struct *tsk)
 {
 	kmem_cache_free(thread_stack_cache, tsk->stack);
+	tsk->stack = NULL;
 }
 
 void thread_stack_cache_init(void)
@@ -321,8 +328,9 @@ void thread_stack_cache_init(void)
 					THREAD_SIZE, NULL);
 	BUG_ON(thread_stack_cache == NULL);
 }
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
@@ -432,10 +440,6 @@ static void release_task_stack(struct task_struct *tsk)
 
 	account_kernel_stack(tsk, -1);
 	free_thread_stack(tsk);
-	tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
-	tsk->stack_vm_area = NULL;
-#endif
 }
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
-- 
cgit v1.2.3


From 546c42b2c5c161619736dd730d3df709181999d0 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:00 +0100
Subject: fork: Duplicate task_struct before stack allocation

alloc_thread_stack_node() already populates the task_struct::stack
member except on IA64. The stack pointer is saved and populated again
because IA64 needs it and arch_dup_task_struct() overwrites it.

Allocate thread's stack after task_struct has been duplicated as a
preparation for further changes.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-3-bigeasy@linutronix.de
---
 kernel/fork.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index f5cc10164334..30c01ce2ae57 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -888,6 +888,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!tsk)
 		return NULL;
 
+	err = arch_dup_task_struct(tsk, orig);
+	if (err)
+		goto free_tsk;
+
 	stack = alloc_thread_stack_node(tsk, node);
 	if (!stack)
 		goto free_tsk;
@@ -897,8 +901,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 	stack_vm_area = task_stack_vm_area(tsk);
 
-	err = arch_dup_task_struct(tsk, orig);
-
 	/*
 	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
 	 * sure they're properly initialized before using any stack-related
@@ -912,9 +914,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	refcount_set(&tsk->stack_refcount, 1);
 #endif
 
-	if (err)
-		goto free_stack;
-
 	err = scs_prepare(tsk, node);
 	if (err)
 		goto free_stack;
-- 
cgit v1.2.3


From 2bb0529c0bc0698f3baf3e88ffd61a18eef252a7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:01 +0100
Subject: fork, IA64: Provide alloc_thread_stack_node() for IA64

Provide a generic alloc_thread_stack_node() for IA64 and
CONFIG_ARCH_THREAD_STACK_ALLOCATOR which returns stack pointer and sets
task_struct::stack so it behaves exactly like the other implementations.

Rename IA64's alloc_thread_stack_node() and add the generic version to the
fork code so it is in one place _and_ to drastically lower the chances of
fat fingering the IA64 code.  Do the same for free_thread_stack().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-4-bigeasy@linutronix.de
---
 kernel/fork.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 30c01ce2ae57..7b70c4741072 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,23 @@ void thread_stack_cache_init(void)
 }
 
 # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+	unsigned long *stack;
+
+	stack = arch_alloc_thread_stack_node(tsk, node);
+	tsk->stack = stack;
+	return stack;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+	arch_free_thread_stack(tsk);
+	tsk->stack = NULL;
+}
+
 #endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-- 
cgit v1.2.3


From 7865aba3ade4cf30f0ac08e015550084a50d9afb Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:02 +0100
Subject: fork: Don't assign the stack pointer in dup_task_struct()

All four versions of alloc_thread_stack_node() assign now
task_struct::stack in case the allocation was successful.

Let alloc_thread_stack_node() return an error code instead of the stack
pointer and remove the stack assignment in dup_task_struct().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-5-bigeasy@linutronix.de
---
 kernel/fork.c | 47 ++++++++++++++++-------------------------------
 1 file changed, 16 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7b70c4741072..875bd43f02ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,7 +211,7 @@ static int free_vm_stack_cache(unsigned int cpu)
 	return 0;
 }
 
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	void *stack;
 	int i;
@@ -232,7 +232,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 
 		tsk->stack_vm_area = s;
 		tsk->stack = s->addr;
-		return s->addr;
+		return 0;
 	}
 
 	/*
@@ -245,17 +245,16 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 				     THREADINFO_GFP & ~__GFP_ACCOUNT,
 				     PAGE_KERNEL,
 				     0, node, __builtin_return_address(0));
-
+	if (!stack)
+		return -ENOMEM;
 	/*
 	 * We can't call find_vm_area() in interrupt context, and
 	 * free_thread_stack() can be called in interrupt context,
 	 * so cache the vm_struct.
 	 */
-	if (stack) {
-		tsk->stack_vm_area = find_vm_area(stack);
-		tsk->stack = stack;
-	}
-	return stack;
+	tsk->stack_vm_area = find_vm_area(stack);
+	tsk->stack = stack;
+	return 0;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
@@ -282,16 +281,16 @@ static void free_thread_stack(struct task_struct *tsk)
 
 #  else /* !CONFIG_VMAP_STACK */
 
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
 	if (likely(page)) {
 		tsk->stack = kasan_reset_tag(page_address(page));
-		return tsk->stack;
+		return 0;
 	}
-	return NULL;
+	return -ENOMEM;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
@@ -305,14 +304,13 @@ static void free_thread_stack(struct task_struct *tsk)
 
 static struct kmem_cache *thread_stack_cache;
 
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-						  int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	unsigned long *stack;
 	stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 	stack = kasan_reset_tag(stack);
 	tsk->stack = stack;
-	return stack;
+	return stack ? 0 : -ENOMEM;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
@@ -332,13 +330,13 @@ void thread_stack_cache_init(void)
 # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
 #else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
 
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	unsigned long *stack;
 
 	stack = arch_alloc_thread_stack_node(tsk, node);
 	tsk->stack = stack;
-	return stack;
+	return stack ? 0 : -ENOMEM;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
@@ -895,8 +893,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
-	unsigned long *stack;
-	struct vm_struct *stack_vm_area __maybe_unused;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -909,24 +905,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (err)
 		goto free_tsk;
 
-	stack = alloc_thread_stack_node(tsk, node);
-	if (!stack)
+	err = alloc_thread_stack_node(tsk, node);
+	if (err)
 		goto free_tsk;
 
 	if (memcg_charge_kernel_stack(tsk))
 		goto free_stack;
 
-	stack_vm_area = task_stack_vm_area(tsk);
-
-	/*
-	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
-	 * sure they're properly initialized before using any stack-related
-	 * functions again.
-	 */
-	tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
-	tsk->stack_vm_area = stack_vm_area;
-#endif
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	refcount_set(&tsk->stack_refcount, 1);
 #endif
-- 
cgit v1.2.3


From f1c1a9ee00e4c53c9ccc03ec1aff4792948a25eb Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:03 +0100
Subject: fork: Move memcg_charge_kernel_stack() into CONFIG_VMAP_STACK

memcg_charge_kernel_stack() is only used in the CONFIG_VMAP_STACK case.

Move memcg_charge_kernel_stack() into the CONFIG_VMAP_STACK block and
invoke it from within alloc_thread_stack_node().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-6-bigeasy@linutronix.de
---
 kernel/fork.c | 69 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 36 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 875bd43f02ca..ac63e7fa8816 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,6 +211,32 @@ static int free_vm_stack_cache(unsigned int cpu)
 	return 0;
 }
 
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+	struct vm_struct *vm = task_stack_vm_area(tsk);
+	int i;
+	int ret;
+
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+	BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+		ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+		if (ret)
+			goto err;
+	}
+	return 0;
+err:
+	/*
+	 * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+	 * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+	 * ignore this page.
+	 */
+	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+		memcg_kmem_uncharge_page(vm->pages[i], 0);
+	return ret;
+}
+
 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	void *stack;
@@ -230,6 +256,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
 
+		if (memcg_charge_kernel_stack(tsk)) {
+			vfree(s->addr);
+			return -ENOMEM;
+		}
+
 		tsk->stack_vm_area = s;
 		tsk->stack = s->addr;
 		return 0;
@@ -247,6 +278,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 				     0, node, __builtin_return_address(0));
 	if (!stack)
 		return -ENOMEM;
+
+	if (memcg_charge_kernel_stack(tsk)) {
+		vfree(stack);
+		return -ENOMEM;
+	}
 	/*
 	 * We can't call find_vm_area() in interrupt context, and
 	 * free_thread_stack() can be called in interrupt context,
@@ -418,36 +454,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 	}
 }
 
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
-{
-#ifdef CONFIG_VMAP_STACK
-	struct vm_struct *vm = task_stack_vm_area(tsk);
-	int ret;
-
-	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
-	if (vm) {
-		int i;
-
-		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
-		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-			/*
-			 * If memcg_kmem_charge_page() fails, page's
-			 * memory cgroup pointer is NULL, and
-			 * memcg_kmem_uncharge_page() in free_thread_stack()
-			 * will ignore this page.
-			 */
-			ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
-						     0);
-			if (ret)
-				return ret;
-		}
-	}
-#endif
-	return 0;
-}
-
 static void release_task_stack(struct task_struct *tsk)
 {
 	if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
@@ -909,9 +915,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (err)
 		goto free_tsk;
 
-	if (memcg_charge_kernel_stack(tsk))
-		goto free_stack;
-
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	refcount_set(&tsk->stack_refcount, 1);
 #endif
-- 
cgit v1.2.3


From 1a03d3f13ffe5dd24142d6db629e72c11b704d99 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:04 +0100
Subject: fork: Move task stack accounting to do_exit()

There is no need to perform the stack accounting of the outgoing task in
its final schedule() invocation which happens with preemption disabled.
The task is leaving, the resources will be freed and the accounting can
happen in do_exit() before the actual schedule invocation which
frees the stack memory.

Move the accounting of the stack memory from release_task_stack() to
exit_task_stack_account() which then can be invoked from do_exit().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-7-bigeasy@linutronix.de
---
 kernel/exit.c |  1 +
 kernel/fork.c | 35 +++++++++++++++++++++++------------
 2 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..c303cffe7fdb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -845,6 +845,7 @@ void __noreturn do_exit(long code)
 		put_page(tsk->task_frag.page);
 
 	validate_creds_for_do_exit(tsk);
+	exit_task_stack_account(tsk);
 
 	check_stack_usage();
 	preempt_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index ac63e7fa8816..25828127db8d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -211,9 +211,8 @@ static int free_vm_stack_cache(unsigned int cpu)
 	return 0;
 }
 
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
 {
-	struct vm_struct *vm = task_stack_vm_area(tsk);
 	int i;
 	int ret;
 
@@ -239,6 +238,7 @@ err:
 
 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+	struct vm_struct *vm;
 	void *stack;
 	int i;
 
@@ -256,7 +256,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
 
-		if (memcg_charge_kernel_stack(tsk)) {
+		if (memcg_charge_kernel_stack(s)) {
 			vfree(s->addr);
 			return -ENOMEM;
 		}
@@ -279,7 +279,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	if (!stack)
 		return -ENOMEM;
 
-	if (memcg_charge_kernel_stack(tsk)) {
+	vm = find_vm_area(stack);
+	if (memcg_charge_kernel_stack(vm)) {
 		vfree(stack);
 		return -ENOMEM;
 	}
@@ -288,19 +289,15 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	 * free_thread_stack() can be called in interrupt context,
 	 * so cache the vm_struct.
 	 */
-	tsk->stack_vm_area = find_vm_area(stack);
+	tsk->stack_vm_area = vm;
 	tsk->stack = stack;
 	return 0;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
 {
-	struct vm_struct *vm = task_stack_vm_area(tsk);
 	int i;
 
-	for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
-		memcg_kmem_uncharge_page(vm->pages[i], 0);
-
 	for (i = 0; i < NR_CACHED_STACKS; i++) {
 		if (this_cpu_cmpxchg(cached_stacks[i], NULL,
 				     tsk->stack_vm_area) != NULL)
@@ -454,12 +451,25 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 	}
 }
 
+void exit_task_stack_account(struct task_struct *tsk)
+{
+	account_kernel_stack(tsk, -1);
+
+	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+		struct vm_struct *vm;
+		int i;
+
+		vm = task_stack_vm_area(tsk);
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+			memcg_kmem_uncharge_page(vm->pages[i], 0);
+	}
+}
+
 static void release_task_stack(struct task_struct *tsk)
 {
 	if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
 		return;  /* Better to leak the stack than to free prematurely */
 
-	account_kernel_stack(tsk, -1);
 	free_thread_stack(tsk);
 }
 
@@ -918,6 +928,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	refcount_set(&tsk->stack_refcount, 1);
 #endif
+	account_kernel_stack(tsk, 1);
 
 	err = scs_prepare(tsk, node);
 	if (err)
@@ -961,8 +972,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->wake_q.next = NULL;
 	tsk->worker_private = NULL;
 
-	account_kernel_stack(tsk, 1);
-
 	kcov_task_init(tsk);
 	kmap_local_fork(tsk);
 
@@ -981,6 +990,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	return tsk;
 
 free_stack:
+	exit_task_stack_account(tsk);
 	free_thread_stack(tsk);
 free_tsk:
 	free_task_struct(tsk);
@@ -2459,6 +2469,7 @@ bad_fork_cleanup_count:
 	exit_creds(p);
 bad_fork_free:
 	WRITE_ONCE(p->__state, TASK_DEAD);
+	exit_task_stack_account(p);
 	put_task_stack(p);
 	delayed_free_task(p);
 fork_out:
-- 
cgit v1.2.3


From e540bf3162e822d7a1f07e69e3bb1b4f925ca368 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:05 +0100
Subject: fork: Only cache the VMAP stack in finish_task_switch()

The task stack could be deallocated later, but for fork()/exec() kind of
workloads (say a shell script executing several commands) it is important
that the stack is released in finish_task_switch() so that in VMAP_STACK
case it can be cached and reused in the new task.

For PREEMPT_RT it would be good if the wake-up in vfree_atomic() could
be avoided in the scheduling path. Far worse are the other
free_thread_stack() implementations which invoke __free_pages()/
kmem_cache_free() with disabled preemption.

Cache the stack in free_thread_stack() in the VMAP_STACK case and
RCU-delay the free path otherwise. Free the stack in the RCU callback.
In the VMAP_STACK case this is another opportunity to fill the cache.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-8-bigeasy@linutronix.de
---
 kernel/fork.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 25828127db8d..177bc64078cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,41 @@ static inline void free_task_struct(struct task_struct *tsk)
 #define NR_CACHED_STACKS 2
 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
 
+struct vm_stack {
+	struct rcu_head rcu;
+	struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+	unsigned int i;
+
+	for (i = 0; i < NR_CACHED_STACKS; i++) {
+		if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+			continue;
+		return true;
+	}
+	return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+	struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+	if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+		return;
+
+	vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+	struct vm_stack *vm_stack = tsk->stack;
+
+	vm_stack->stack_vm_area = tsk->stack_vm_area;
+	call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
 static int free_vm_stack_cache(unsigned int cpu)
 {
 	struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -296,24 +331,27 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 
 static void free_thread_stack(struct task_struct *tsk)
 {
-	int i;
+	if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+		thread_stack_delayed_free(tsk);
 
-	for (i = 0; i < NR_CACHED_STACKS; i++) {
-		if (this_cpu_cmpxchg(cached_stacks[i], NULL,
-				     tsk->stack_vm_area) != NULL)
-			continue;
-
-		tsk->stack = NULL;
-		tsk->stack_vm_area = NULL;
-		return;
-	}
-	vfree_atomic(tsk->stack);
 	tsk->stack = NULL;
 	tsk->stack_vm_area = NULL;
 }
 
 #  else /* !CONFIG_VMAP_STACK */
 
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+	__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+	struct rcu_head *rh = tsk->stack;
+
+	call_rcu(rh, thread_stack_free_rcu);
+}
+
 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
@@ -328,7 +366,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 
 static void free_thread_stack(struct task_struct *tsk)
 {
-	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+	thread_stack_delayed_free(tsk);
 	tsk->stack = NULL;
 }
 
@@ -337,6 +375,18 @@ static void free_thread_stack(struct task_struct *tsk)
 
 static struct kmem_cache *thread_stack_cache;
 
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+	kmem_cache_free(thread_stack_cache, rh);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+	struct rcu_head *rh = tsk->stack;
+
+	call_rcu(rh, thread_stack_free_rcu);
+}
+
 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
 	unsigned long *stack;
@@ -348,7 +398,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 
 static void free_thread_stack(struct task_struct *tsk)
 {
-	kmem_cache_free(thread_stack_cache, tsk->stack);
+	thread_stack_delayed_free(tsk);
 	tsk->stack = NULL;
 }
 
-- 
cgit v1.2.3


From 0ce055f85335e48bc571114d61a70ae217039362 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Feb 2022 11:24:06 +0100
Subject: fork: Use IS_ENABLED() in account_kernel_stack()

Not strickly needed but checking CONFIG_VMAP_STACK instead of
task_stack_vm_area()' result allows the compiler the remove the else
path in the CONFIG_VMAP_STACK case where the pointer can't be NULL.

Check for CONFIG_VMAP_STACK in order to use the proper path.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220217102406.3697941-9-bigeasy@linutronix.de
---
 kernel/fork.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 177bc64078cd..1279b57c4ad9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -485,16 +485,16 @@ void vm_area_free(struct vm_area_struct *vma)
 
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-	void *stack = task_stack_page(tsk);
-	struct vm_struct *vm = task_stack_vm_area(tsk);
-
-	if (vm) {
+	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+		struct vm_struct *vm = task_stack_vm_area(tsk);
 		int i;
 
 		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
 			mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
 					      account * (PAGE_SIZE / 1024));
 	} else {
+		void *stack = task_stack_page(tsk);
+
 		/* All stack pages are in the same node. */
 		mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
 				      account * (THREAD_SIZE / 1024));
-- 
cgit v1.2.3


From 73bd66d9c834220579c881a3eb020fd8917075d8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 9 Feb 2022 09:28:28 +0100
Subject: scsi: block: Remove REQ_OP_WRITE_SAME support

No more users of REQ_OP_WRITE_SAME or drivers implementing it are left,
so remove the infrastructure.

[mkp: fold in and tweak sysfs reporting fix]

Link: https://lore.kernel.org/r/20220209082828.2629273-8-hch@lst.de
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 kernel/trace/blktrace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index af68a67179b4..19514edc44f7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1892,7 +1892,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
 
 	switch (op & REQ_OP_MASK) {
 	case REQ_OP_WRITE:
-	case REQ_OP_WRITE_SAME:
 		rwbs[i++] = 'W';
 		break;
 	case REQ_OP_DISCARD:
-- 
cgit v1.2.3


From 95458477f5b2dc436e3aa6aa25c0f84bb83e6195 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 22 Feb 2022 14:50:43 +0100
Subject: sched/headers: Add header guard to kernel/sched/sched.h

Use the canonical header guard naming of the full path to the header.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3da5718cd641..eab4a18f71c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,9 @@
 /*
  * Scheduler internal types and methods:
  */
+#ifndef _KERNEL_SCHED_SCHED_H
+#define _KERNEL_SCHED_SCHED_H
+
 #include <linux/sched.h>
 
 #include <linux/sched/autogroup.h>
@@ -3137,3 +3140,4 @@ extern int sched_dynamic_mode(const char *str);
 extern void sched_dynamic_update(int mode);
 #endif
 
+#endif /* _KERNEL_SCHED_SCHED_H */
-- 
cgit v1.2.3


From d90a2f160a1cd9a1745896c381afdf8d2812fd6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 20 Nov 2021 10:39:20 +0100
Subject: sched/headers: Add header guard to kernel/sched/stats.h and
 kernel/sched/autogroup.h

Protect against multiple inclusion.

Also include "sched.h" in "stat.h", as it relies on it.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/autogroup.h | 5 +++++
 kernel/sched/stats.h     | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 90fcbfdd70c3..90d69f2c5eaf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,4 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 
 struct autogroup {
@@ -59,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 }
 
 #endif /* CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 3a3c826dd83a..edc0d13fc61b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,7 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
 
 #ifdef CONFIG_SCHEDSTATS
 
+#include "sched.h"
+
 extern struct static_key_false sched_schedstats;
 
 /*
@@ -298,3 +302,5 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n
 # define sched_info_dequeue(rq, t)	do { } while (0)
 # define sched_info_switch(rq, t, next)	do { } while (0)
 #endif /* CONFIG_SCHED_INFO */
+
+#endif /* _KERNEL_STATS_H */
-- 
cgit v1.2.3


From fa28abed7a84f5c9902fe28b3bb58f7900583e83 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 21 Jun 2021 08:41:43 +0200
Subject: sched/headers: sched/clock: Mark all functions 'notrace', remove
 CC_FLAGS_FTRACE build asymmetry

Mark all non-init functions in kernel/sched.c as 'notrace', instead of
turning them all off via CC_FLAGS_FTRACE.

This is going to allow the treatment of this file as any other scheduler
file, and it can be #include-ed in compound compilation units as well.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/Makefile |  3 ---
 kernel/sched/clock.c  | 42 +++++++++++++++++++++---------------------
 2 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c83b37af155b..c0c52026ad0d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,7 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
 
 # The compilers are complaining about unused variables inside an if(0) scope
 # block. This is daft, shut them up.
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c2b2859ddd82..540d0e50e31c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -61,7 +61,7 @@
  * This is default implementation.
  * Architectures and sub-architectures can override this.
  */
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
 {
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
@@ -95,28 +95,28 @@ struct sched_clock_data {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
 
-static inline struct sched_clock_data *this_scd(void)
+notrace static inline struct sched_clock_data *this_scd(void)
 {
 	return this_cpu_ptr(&sched_clock_data);
 }
 
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
 {
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-int sched_clock_stable(void)
+notrace int sched_clock_stable(void)
 {
 	return static_branch_likely(&__sched_clock_stable);
 }
 
-static void __scd_stamp(struct sched_clock_data *scd)
+notrace static void __scd_stamp(struct sched_clock_data *scd)
 {
 	scd->tick_gtod = ktime_get_ns();
 	scd->tick_raw = sched_clock();
 }
 
-static void __set_sched_clock_stable(void)
+notrace static void __set_sched_clock_stable(void)
 {
 	struct sched_clock_data *scd;
 
@@ -151,7 +151,7 @@ static void __set_sched_clock_stable(void)
  * The only way to fully avoid random clock jumps is to boot with:
  * "tsc=unstable".
  */
-static void __sched_clock_work(struct work_struct *work)
+notrace static void __sched_clock_work(struct work_struct *work)
 {
 	struct sched_clock_data *scd;
 	int cpu;
@@ -177,7 +177,7 @@ static void __sched_clock_work(struct work_struct *work)
 
 static DECLARE_WORK(sched_clock_work, __sched_clock_work);
 
-static void __clear_sched_clock_stable(void)
+notrace static void __clear_sched_clock_stable(void)
 {
 	if (!sched_clock_stable())
 		return;
@@ -186,7 +186,7 @@ static void __clear_sched_clock_stable(void)
 	schedule_work(&sched_clock_work);
 }
 
-void clear_sched_clock_stable(void)
+notrace void clear_sched_clock_stable(void)
 {
 	__sched_clock_stable_early = 0;
 
@@ -196,7 +196,7 @@ void clear_sched_clock_stable(void)
 		__clear_sched_clock_stable();
 }
 
-static void __sched_clock_gtod_offset(void)
+notrace static void __sched_clock_gtod_offset(void)
 {
 	struct sched_clock_data *scd = this_scd();
 
@@ -246,12 +246,12 @@ late_initcall(sched_clock_init_late);
  * min, max except they take wrapping into account
  */
 
-static inline u64 wrap_min(u64 x, u64 y)
+notrace static inline u64 wrap_min(u64 x, u64 y)
 {
 	return (s64)(x - y) < 0 ? x : y;
 }
 
-static inline u64 wrap_max(u64 x, u64 y)
+notrace static inline u64 wrap_max(u64 x, u64 y)
 {
 	return (s64)(x - y) > 0 ? x : y;
 }
@@ -262,7 +262,7 @@ static inline u64 wrap_max(u64 x, u64 y)
  *  - filter out backward motion
  *  - use the GTOD tick value to create a window to filter crazy TSC values
  */
-static u64 sched_clock_local(struct sched_clock_data *scd)
+notrace static u64 sched_clock_local(struct sched_clock_data *scd)
 {
 	u64 now, clock, old_clock, min_clock, max_clock, gtod;
 	s64 delta;
@@ -295,7 +295,7 @@ again:
 	return clock;
 }
 
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
 {
 	struct sched_clock_data *my_scd = this_scd();
 	u64 this_clock, remote_clock;
@@ -362,7 +362,7 @@ again:
  *
  * See cpu_clock().
  */
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
 {
 	struct sched_clock_data *scd;
 	u64 clock;
@@ -386,7 +386,7 @@ u64 sched_clock_cpu(int cpu)
 }
 EXPORT_SYMBOL_GPL(sched_clock_cpu);
 
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd;
 
@@ -403,7 +403,7 @@ void sched_clock_tick(void)
 	sched_clock_local(scd);
 }
 
-void sched_clock_tick_stable(void)
+notrace void sched_clock_tick_stable(void)
 {
 	if (!sched_clock_stable())
 		return;
@@ -423,7 +423,7 @@ void sched_clock_tick_stable(void)
 /*
  * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_idle_sleep_event(void)
 {
 	sched_clock_cpu(smp_processor_id());
 }
@@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
 /*
  * We just idled; resync with ktime.
  */
-void sched_clock_idle_wakeup_event(void)
+notrace void sched_clock_idle_wakeup_event(void)
 {
 	unsigned long flags;
 
@@ -458,7 +458,7 @@ void __init sched_clock_init(void)
 	local_irq_enable();
 }
 
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
 {
 	if (!static_branch_likely(&sched_clock_running))
 		return 0;
@@ -476,7 +476,7 @@ u64 sched_clock_cpu(int cpu)
  * On bare metal this function should return the same as local_clock.
  * Architectures and sub-architectures can override this.
  */
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
 {
 	return local_clock();
 }
-- 
cgit v1.2.3


From 81de6572fe980a98a1c6c5eacdfd2a9137894f32 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 21 Jun 2021 08:50:48 +0200
Subject: sched/headers: Fix comment typo in kernel/sched/cpudeadline.c

File name changed.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/cpudeadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ceb03d76c0cc..0e196f0de492 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- *  kernel/sched/cpudl.c
+ *  kernel/sched/cpudeadline.c
  *
  *  Global CPU deadline management
  *
-- 
cgit v1.2.3


From 801c141955108fb7cf1244dda76e6de8b16fd3ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 22 Feb 2022 13:23:24 +0100
Subject: sched/headers: Introduce kernel/sched/build_utility.c and build
 multiple .c files there

Collect all utility functionality source code files into a single kernel/sched/build_utility.c file,
via #include-ing the .c files:

    kernel/sched/clock.c
    kernel/sched/completion.c
    kernel/sched/loadavg.c
    kernel/sched/swait.c
    kernel/sched/wait_bit.c
    kernel/sched/wait.c

CONFIG_CPU_FREQ:
    kernel/sched/cpufreq.c

CONFIG_CPU_FREQ_GOV_SCHEDUTIL:
    kernel/sched/cpufreq_schedutil.c

CONFIG_CGROUP_CPUACCT:
    kernel/sched/cpuacct.c

CONFIG_SCHED_DEBUG:
    kernel/sched/debug.c

CONFIG_SCHEDSTATS:
    kernel/sched/stats.c

CONFIG_SMP:
   kernel/sched/cpupri.c
   kernel/sched/stop_task.c
   kernel/sched/topology.c

CONFIG_SCHED_CORE:
   kernel/sched/core_sched.c

CONFIG_PSI:
   kernel/sched/psi.c

CONFIG_MEMBARRIER:
   kernel/sched/membarrier.c

CONFIG_CPU_ISOLATION:
   kernel/sched/isolation.c

CONFIG_SCHED_AUTOGROUP:
   kernel/sched/autogroup.c

The goal is to amortize the 60+ KLOC header bloat from over a dozen build units into
a single build unit.

The build time of build_utility.c also roughly matches the build time of core.c and
fair.c - allowing better load-balancing of scheduler-only rebuilds.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/Makefile            | 25 ++++++--------
 kernel/sched/autogroup.c         |  3 +-
 kernel/sched/build_utility.c     | 70 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/clock.c             |  2 --
 kernel/sched/completion.c        |  2 +-
 kernel/sched/core_sched.c        |  3 --
 kernel/sched/cpuacct.c           |  3 +-
 kernel/sched/cpufreq.c           |  3 --
 kernel/sched/cpufreq_schedutil.c |  7 ----
 kernel/sched/cpupri.c            |  1 -
 kernel/sched/debug.c             |  1 -
 kernel/sched/isolation.c         |  1 -
 kernel/sched/loadavg.c           |  1 -
 kernel/sched/membarrier.c        |  1 -
 kernel/sched/psi.c               | 15 ---------
 kernel/sched/sched.h             | 57 ++++++++++++++++++++++++++++++--
 kernel/sched/stats.c             |  1 -
 kernel/sched/stop_task.c         |  1 -
 kernel/sched/swait.c             |  1 -
 kernel/sched/topology.c          |  1 -
 kernel/sched/wait.c              |  1 -
 kernel/sched/wait_bit.c          |  2 +-
 22 files changed, 139 insertions(+), 63 deletions(-)
 create mode 100644 kernel/sched/build_utility.c

(limited to 'kernel')

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c0c52026ad0d..4a4785cb3cd2 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
-
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-obj-$(CONFIG_CPU_FREQ) += cpufreq.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-obj-$(CONFIG_PSI) += psi.o
-obj-$(CONFIG_SCHED_CORE) += core_sched.o
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_utility.o
+obj-y += idle.o rt.o deadline.o cputime.o cpudeadline.o pelt.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 31dd2593145e..16092b49ff6a 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,9 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
+
 /*
  * Auto-group scheduling implementation:
  */
-#include <linux/nospec.h>
-#include "sched.h"
 
 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
 static struct autogroup autogroup_default;
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..31216ce0b4b3
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ *   comparable to core.c, fair.c, smp.c and policy.c, the other
+ *   big compilation units. This helps balance build time, while
+ *   coalescing source files to amortize header inclusion
+ *   cost. )
+ */
+
+#include "sched.h"
+#include "sched-pelt.h"
+
+#include <linux/sched_clock.h>
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include "debug.c"
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#ifdef CONFIG_SMP
+# include "cpupri.c"
+# include "stop_task.c"
+# include "topology.c"
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 540d0e50e31c..d9272d9061a3 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -53,8 +53,6 @@
  * that is otherwise invisible (TSC gets stopped).
  *
  */
-#include "sched.h"
-#include <linux/sched_clock.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a778554f9dad..35f15c26ed54 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+
 /*
  * Generic wait-for-completion handler;
  *
@@ -11,7 +12,6 @@
  * typically be used for exclusion which gives rise to priority inversion.
  * Waiting for completion is a typically sync point, but not an exclusion point.
  */
-#include "sched.h"
 
 /**
  * complete: - signals a single thread waiting on this completion
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index c8746a9a7ada..38a2cec21014 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,8 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <linux/prctl.h>
-#include "sched.h"
-
 /*
  * A simple wrapper around refcount. An allocated sched_core_cookie's
  * address is used to compute the cookie of the task.
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 3d06c5e4220d..2c505cf800aa 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,12 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
+
 /*
  * CPU accounting code for task groups.
  *
  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  * (balbir@in.ibm.com).
  */
-#include <asm/irq_regs.h>
-#include "sched.h"
 
 /* Time spent by the tasks of the CPU accounting group executing in ... */
 enum cpuacct_stat_index {
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 7c2fe50fd76d..5252fb191fae 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,9 +5,6 @@
  * Copyright (C) 2016, Intel Corporation
  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
  */
-#include <linux/cpufreq.h>
-
-#include "sched.h"
 
 DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6d65ab6e484e..f68885d049f2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,13 +6,6 @@
  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "sched.h"
-
-#include <linux/sched/cpufreq.h>
-#include <trace/events/power.h>
-
 #define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)
 
 struct sugov_tunables {
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d583f2aa744e..fa9ce9d83683 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,7 +22,6 @@
  *  worst case complexity of O(min(101, nr_domcpus)), though the scenario that
  *  yields the worst case search is fairly contrived.
  */
-#include "sched.h"
 
 /*
  * p->rt_priority   p->prio   newpri   cpupri
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 102d6f70e84d..bb3d63bdf4ae 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,7 +6,6 @@
  *
  * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
  */
-#include "sched.h"
 
 /*
  * This allows printing both to /proc/sched_debug and
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b4d10815c45a..373d42c707bc 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,7 +7,6 @@
  * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
  *
  */
-#include "sched.h"
 
 enum hk_flags {
 	HK_FLAG_TIMER		= BIT(HK_TYPE_TIMER),
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 954b229868d9..52c8f8226b0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,7 +6,6 @@
  * figure. Its a silly number but people think its important. We go through
  * great pains to make it work on big machines and tickless kernels.
  */
-#include "sched.h"
 
 /*
  * Global load-average calculations
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 3d2825408e3a..0c5be7ebb1dc 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,7 +4,6 @@
  *
  * membarrier system call
  */
-#include "sched.h"
 
 /*
  * For documentation purposes, here are some membarrier ordering
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8fb08a12f602..a4fa3aadfcba 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -137,21 +137,6 @@
  * sampling of the aggregate task states would be.
  */
 
-#include "../workqueue_internal.h"
-#include <linux/sched/loadavg.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/seqlock.h>
-#include <linux/uaccess.h>
-#include <linux/cgroup.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/ctype.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/psi.h>
-#include "sched.h"
-
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eab4a18f71c2..79c7a8a2be65 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,7 +6,25 @@
 #define _KERNEL_SCHED_SCHED_H
 
 #include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/ktime_api.h>
+#include <linux/cgroup_api.h>
+#include <linux/topology.h>
+#include <linux/prctl.h>
+#include <linux/ptrace_api.h>
+#include <linux/gfp_api.h>
+#include <linux/sched/clock.h>
+#include <linux/workqueue_api.h>
+#include <linux/tick.h>
+#include <linux/ktime_api.h>
+#include <linux/swait_api.h>
+#include <linux/hashtable_api.h>
+#include <linux/cpufreq.h>
+#include <linux/utsname.h>
 
+#include <asm/irq_regs.h>
+
+#include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/coredump.h>
@@ -24,6 +42,7 @@
 #include <linux/sched/nohz.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/prio.h>
+#include <linux/sched/rseq_api.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/smt.h>
@@ -31,6 +50,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/thread_info_api.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/user.h>
 #include <linux/sched/wake_q.h>
@@ -69,8 +89,7 @@
 #include <linux/syscalls.h>
 #include <linux/task_work.h>
 #include <linux/tsacct_kern.h>
-
-#include <asm/tlb.h>
+#include <linux/hrtimer_api.h>
 
 #ifdef CONFIG_PARAVIRT
 # include <asm/paravirt.h>
@@ -87,6 +106,40 @@
 # define SCHED_WARN_ON(x)	({ (void)(x), 0; })
 #endif
 
+#include <linux/bitmap.h>
+#include <linux/capability.h>
+#include <linux/cgroup.h>
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/jiffies.h>
+#include <linux/kref_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/module.h>
+#include <linux/mutex_api.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/sched/affinity.h>
+#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/syscalls_api.h>
+#include <linux/syscalls.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <trace/events/power.h>
+
+#include "../workqueue_internal.h"
+
 struct rq;
 struct cpuidle_state;
 
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 07dde2928c79..857f837f52cb 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,7 +2,6 @@
 /*
  * /proc/schedstat implementation
  */
-#include "sched.h"
 
 void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
 			       struct sched_statistics *stats)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 0b165a25f22f..d04073a93eb4 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,7 +7,6 @@
  *
  * See kernel/stop_machine.c
  */
-#include "sched.h"
 
 #ifdef CONFIG_SMP
 static int
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e1c655f928c7..76b9b796e695 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,7 +2,6 @@
 /*
  * <linux/swait.h> (simple wait queues ) implementation:
  */
-#include "sched.h"
 
 void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
 			     struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32841c6741d1..e8af72fd70bd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,7 +2,6 @@
 /*
  * Scheduler topology setup/handling methods
  */
-#include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index eca38107b32f..9860bb9a847c 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,7 +4,6 @@
  *
  * (C) 2004 Nadia Yvette Chambers, Oracle
  */
-#include "sched.h"
 
 void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
 {
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..d4788f810b55 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-only
+
 /*
  * The implementation of the wait_bit*() and related waiting APIs:
  */
-#include "sched.h"
 
 #define WAIT_TABLE_BITS 8
 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-- 
cgit v1.2.3


From f96eca432015ddc1b621632488ebc345bca06791 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 22 Feb 2022 13:46:03 +0100
Subject: sched/headers: Introduce kernel/sched/build_policy.c and build
 multiple .c files there

Similarly to kernel/sched/build_utility.c, collect all 'scheduling policy' related
source code files into kernel/sched/build_policy.c:

    kernel/sched/idle.c

    kernel/sched/rt.c

    kernel/sched/cpudeadline.c
    kernel/sched/pelt.c

    kernel/sched/cputime.c
    kernel/sched/deadline.c

With the exception of fair.c, which we continue to build as a separate file
for build efficiency and parallelism reasons.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/Makefile       |  2 +-
 kernel/sched/build_policy.c | 29 +++++++++++++++++++++++++++++
 kernel/sched/cpudeadline.c  |  1 -
 kernel/sched/cputime.c      |  1 -
 kernel/sched/deadline.c     |  2 --
 kernel/sched/idle.c         |  3 ---
 kernel/sched/pelt.c         |  4 ----
 kernel/sched/rt.c           |  3 ---
 kernel/sched/sched.h        |  8 ++++++++
 9 files changed, 38 insertions(+), 15 deletions(-)
 create mode 100644 kernel/sched/build_policy.c

(limited to 'kernel')

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 4a4785cb3cd2..976092b7bd45 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -30,5 +30,5 @@ endif
 #
 obj-y += core.o
 obj-y += fair.o
+obj-y += build_policy.o
 obj-y += build_utility.o
-obj-y += idle.o rt.o deadline.o cputime.o cpudeadline.o pelt.o
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..6bb384ddcfab
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ *   comparable to core.c and fair.c, the other two big
+ *   compilation units. This helps balance build time, while
+ *   coalescing source files to amortize header inclusion
+ *   cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+#include "sched.h"
+#include "pelt.h"
+
+#include "idle.c"
+
+#include "rt.c"
+
+#ifdef CONFIG_SMP
+# include "cpudeadline.c"
+# include "pelt.c"
+#endif
+
+#include "cputime.c"
+#include "deadline.c"
+
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 0e196f0de492..02d970a879ed 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -6,7 +6,6 @@
  *
  *  Author: Juri Lelli <j.lelli@sssup.it>
  */
-#include "sched.h"
 
 static inline int parent(int i)
 {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b7ec42732b28..78a233d43757 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,7 +2,6 @@
 /*
  * Simple CPU accounting cgroup controller
  */
-#include "sched.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d2c072b0ef01..fca2d7de4d3d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,8 +15,6 @@
  *                    Michael Trimarchi <michael@amarulasolutions.com>,
  *                    Fabio Checconi <fchecconi@gmail.com>
  */
-#include "sched.h"
-#include "pelt.h"
 
 struct dl_bandwidth def_dl_bandwidth;
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d17b0a5ce6ac..8f8b5020e76a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,9 +6,6 @@
  * (NOTE: these are not related to SCHED_IDLE batch scheduled
  *        tasks which are handled in sched/fair.c )
  */
-#include "sched.h"
-
-#include <trace/events/power.h>
 
 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a554e3bbab2b..0f310768260c 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -24,10 +24,6 @@
  *  Author: Vincent Guittot <vincent.guittot@linaro.org>
  */
 
-#include <linux/sched.h>
-#include "sched.h"
-#include "pelt.h"
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7b4f4fbbb404..ff4c044aed12 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,9 +3,6 @@
  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
  * policies)
  */
-#include "sched.h"
-
-#include "pelt.h"
 
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 79c7a8a2be65..f7d10b98e911 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -13,6 +13,7 @@
 #include <linux/prctl.h>
 #include <linux/ptrace_api.h>
 #include <linux/gfp_api.h>
+#include <linux/posix-timers.h>
 #include <linux/sched/clock.h>
 #include <linux/workqueue_api.h>
 #include <linux/tick.h>
@@ -113,8 +114,11 @@
 #include <linux/cpumask_api.h>
 #include <linux/ctype.h>
 #include <linux/file.h>
+#include <linux/hrtimer_api.h>
+#include <linux/interrupt.h>
 #include <linux/jiffies.h>
 #include <linux/kref_api.h>
+#include <linux/ktime_api.h>
 #include <linux/lockdep_api.h>
 #include <linux/module.h>
 #include <linux/mutex_api.h>
@@ -126,12 +130,16 @@
 #include <linux/sched/loadavg.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/rseq_api.h>
+#include <linux/sched/signal.h>
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
+#include <linux/softirq.h>
+#include <linux/spinlock_api.h>
 #include <linux/syscalls_api.h>
 #include <linux/syscalls.h>
 #include <linux/topology.h>
 #include <linux/types.h>
+#include <linux/u64_stats_sync_api.h>
 #include <linux/uaccess.h>
 #include <linux/wait_api.h>
 #include <linux/workqueue_api.h>
-- 
cgit v1.2.3


From b9e9c6ca6e54b5d58a57663f76c5cb33c12ea98f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 13 Feb 2022 08:19:43 +0100
Subject: sched/headers: Standardize kernel/sched/sched.h header dependencies

kernel/sched/sched.h is a weird mix of ad-hoc headers included
in the middle of the header.

Two of them rely on being included in the middle of kernel/sched/sched.h,
due to definitions they require:

 - "stat.h" needs the rq definitions.
 - "autogroup.h" needs the task_group definition.

Move the inclusion of these two files out of kernel/sched/sched.h, and
include them in all files that require them.

Move of the rest of the header dependencies to the top of the
kernel/sched/sched.h file.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/build_policy.c  |  3 +++
 kernel/sched/build_utility.c |  2 ++
 kernel/sched/core.c          |  2 ++
 kernel/sched/fair.c          |  2 ++
 kernel/sched/sched.h         | 20 +++++++++++---------
 kernel/sched/stats.h         |  4 ++--
 6 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 6bb384ddcfab..9a169b2f97e6 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -13,6 +13,9 @@
  */
 
 #include "sched.h"
+
+#include "autogroup.h"
+#include "stats.h"
 #include "pelt.h"
 
 #include "idle.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 31216ce0b4b3..8b2b199983bf 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -12,6 +12,8 @@
 
 #include "sched.h"
 #include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
 
 #include <linux/sched_clock.h>
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef946123e9af..22de53710ee9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11,6 +11,8 @@
 #undef CREATE_TRACE_POINTS
 
 #include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
 
 #include <linux/nospec.h>
 #include <linux/blkdev.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 16874e112fe6..7391c9287503 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -21,6 +21,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 #include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f7d10b98e911..f255ec2afeca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,11 +96,20 @@
 # include <asm/paravirt.h>
 #endif
 
+#include <trace/events/sched.h>
+
+#ifdef CONFIG_CGROUP_SCHED
+#include <linux/cgroup.h>
+#include <linux/psi.h>
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/static_key.h>
+#endif
+
 #include "cpupri.h"
 #include "cpudeadline.h"
 
-#include <trace/events/sched.h>
-
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
@@ -417,9 +426,6 @@ extern bool dl_cpu_busy(unsigned int cpu);
 
 #ifdef CONFIG_CGROUP_SCHED
 
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-
 struct cfs_rq;
 struct rt_rq;
 
@@ -1919,9 +1925,6 @@ extern void flush_smp_call_function_from_idle(void);
 static inline void flush_smp_call_function_from_idle(void) { }
 #endif
 
-#include "stats.h"
-#include "autogroup.h"
-
 #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
 
 extern void __sched_core_account_forceidle(struct rq *rq);
@@ -2016,7 +2019,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
 #ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
 # define const_debug __read_mostly
 #else
 # define const_debug const
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index edc0d13fc61b..a0df79e6038c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -2,10 +2,10 @@
 #ifndef _KERNEL_STATS_H
 #define _KERNEL_STATS_H
 
-#ifdef CONFIG_SCHEDSTATS
-
 #include "sched.h"
 
+#ifdef CONFIG_SCHEDSTATS
+
 extern struct static_key_false sched_schedstats;
 
 /*
-- 
cgit v1.2.3


From e66f6481a8c748ce2d4b37a3d5e10c4dd0d65e80 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 23 Feb 2022 08:17:15 +0100
Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/core.c
 dependencies

Use all generic headers from kernel/sched/sched.h that are required
for it to build.

Sort the sections alphabetically.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 22de53710ee9..5eaa4211d93c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,7 +6,73 @@
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  */
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
+
+#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
+#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/sched/wake_q.h>
+#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# include <linux/entry-common.h>
+#endif
+
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+
 #define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
 #include <trace/events/sched.h>
 #undef CREATE_TRACE_POINTS
 
@@ -14,22 +80,15 @@
 #include "stats.h"
 #include "autogroup.h"
 
-#include <linux/nospec.h>
-#include <linux/blkdev.h>
-#include <linux/jump_label.h>
-#include <linux/kcov.h>
-#include <linux/scs.h>
-
-#include <asm/switch_to.h>
-#include <asm/tlb.h>
+#include "autogroup.h"
+#include "pelt.h"
+#include "smp.h"
+#include "stats.h"
 
 #include "../workqueue_internal.h"
 #include "../../fs/io-wq.h"
 #include "../smpboot.h"
 
-#include "pelt.h"
-#include "smp.h"
-
 /*
  * Export tracepoints that act as a bare tracehook (ie: have no trace event
  * associated with them) to allow external modules to probe them.
-- 
cgit v1.2.3


From c4ad6fcb67c42d65481c85733c8009c8afdfdf4e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 22 Feb 2022 13:50:01 +0100
Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/fair.c
 dependencies

Use all generic headers from kernel/sched/sched.h that are required
for it to build.

Sort the sections alphabetically.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/fair.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7391c9287503..89d21fda106c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,6 +20,35 @@
  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+
+#include <asm/switch_to.h>
+
+#include <linux/sched/cond_resched.h>
+
 #include "sched.h"
 #include "stats.h"
 #include "autogroup.h"
-- 
cgit v1.2.3


From 0dda4eeb484962ad574fcea77a78a4fbd7bc06ba Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 21 Jun 2021 10:33:56 +0200
Subject: sched/headers: Reorganize, clean up and optimize
 kernel/sched/build_policy.c dependencies

Use all generic headers from kernel/sched/sched.h that are required
for it to build.

Sort the sections alphabetically.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/build_policy.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 9a169b2f97e6..a2e4023771a0 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -12,12 +12,36 @@
  * core.c and fair.c are built separately.
  */
 
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/livepatch.h>
+#include <linux/psi.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+
+#ifdef CONFIG_PARAVIRT
+# include <asm/paravirt.h>
+#endif
+
+#include <uapi/linux/sched/types.h>
+
 #include "sched.h"
 
 #include "autogroup.h"
 #include "stats.h"
 #include "pelt.h"
 
+/* Source code modules: */
+
 #include "idle.c"
 
 #include "rt.c"
-- 
cgit v1.2.3


From e81daa7b6489e9810fa699c5104c6fd500e64fb8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 19 Jul 2021 12:43:51 +0200
Subject: sched/headers: Reorganize, clean up and optimize
 kernel/sched/build_utility.c dependencies

Use all generic headers from kernel/sched/sched.h that are required
for it to build.

Sort the sections alphabetically.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/build_utility.c | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 8b2b199983bf..bc2f4d648209 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -9,14 +9,53 @@
  *   coalescing source files to amortize header inclusion
  *   cost. )
  */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PARAVIRT
+# include <asm/paravirt.h>
+#endif
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
 
 #include "sched.h"
 #include "sched-pelt.h"
 #include "stats.h"
 #include "autogroup.h"
 
-#include <linux/sched_clock.h>
-
 #include "clock.c"
 
 #ifdef CONFIG_CGROUP_CPUACCT
-- 
cgit v1.2.3


From 4ff8f2ca6ccd9e0cc5665d09f86d631b3ae3a14c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 22 Feb 2022 14:51:58 +0100
Subject: sched/headers: Reorganize, clean up and optimize kernel/sched/sched.h
 dependencies

Remove all headers, except the ones required to make this header
build standalone.

Also include stats.h in sched.h explicitly - dependencies already
require this.

Summary of the build speedup gained through the last ~15 scheduler build &
header dependency patches:

Cumulative scheduler (kernel/sched/) build time speedup on a
Linux distribution's config, which enables all scheduler features,
compared to the vanilla kernel:

  _____________________________________________________________________________
 |
 |  Vanilla kernel (v5.13-rc7):
 |_____________________________________________________________________________
 |
 |  Performance counter stats for 'make -j96 kernel/sched/' (3 runs):
 |
 |   126,975,564,374      instructions              #    1.45  insn per cycle           ( +-  0.00% )
 |    87,637,847,671      cycles                    #    3.959 GHz                      ( +-  0.30% )
 |         22,136.96 msec cpu-clock                 #    7.499 CPUs utilized            ( +-  0.29% )
 |
 |            2.9520 +- 0.0169 seconds time elapsed  ( +-  0.57% )
 |_____________________________________________________________________________
 |
 |  Patched kernel:
 |_____________________________________________________________________________
 |
 | Performance counter stats for 'make -j96 kernel/sched/' (3 runs):
 |
 |    50,420,496,914      instructions              #    1.47  insn per cycle           ( +-  0.00% )
 |    34,234,322,038      cycles                    #    3.946 GHz                      ( +-  0.31% )
 |          8,675.81 msec cpu-clock                 #    3.053 CPUs utilized            ( +-  0.45% )
 |
 |            2.8420 +- 0.0181 seconds time elapsed  ( +-  0.64% )
 |_____________________________________________________________________________

Summary:

  - CPU time used to build the scheduler dropped by -60.9%, a reduction
    from 22.1 clock-seconds to 8.7 clock-seconds.

  - Wall-clock time to build the scheduler dropped by -3.9%, a reduction
    from 2.95 seconds to 2.84 seconds.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/build_policy.c  |   4 --
 kernel/sched/build_utility.c |   6 +-
 kernel/sched/sched.h         | 147 +++++++++++++------------------------------
 kernel/sched/stats.h         |   2 -
 4 files changed, 46 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index a2e4023771a0..e0104b45029a 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -28,10 +28,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/vtime.h>
 
-#ifdef CONFIG_PARAVIRT
-# include <asm/paravirt.h>
-#endif
-
 #include <uapi/linux/sched/types.h>
 
 #include "sched.h"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index bc2f4d648209..eec0849b2aae 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -44,13 +44,11 @@
 #include <linux/wait_api.h>
 #include <linux/workqueue_api.h>
 
-#ifdef CONFIG_PARAVIRT
-# include <asm/paravirt.h>
-#endif
-
 #include <uapi/linux/prctl.h>
 #include <uapi/linux/sched/types.h>
 
+#include <asm/switch_to.h>
+
 #include "sched.h"
 #include "sched-pelt.h"
 #include "stats.h"
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f255ec2afeca..0d4217965561 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,158 +5,97 @@
 #ifndef _KERNEL_SCHED_SCHED_H
 #define _KERNEL_SCHED_SCHED_H
 
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/ktime_api.h>
-#include <linux/cgroup_api.h>
-#include <linux/topology.h>
-#include <linux/prctl.h>
-#include <linux/ptrace_api.h>
-#include <linux/gfp_api.h>
-#include <linux/posix-timers.h>
-#include <linux/sched/clock.h>
-#include <linux/workqueue_api.h>
-#include <linux/tick.h>
-#include <linux/ktime_api.h>
-#include <linux/swait_api.h>
-#include <linux/hashtable_api.h>
-#include <linux/cpufreq.h>
-#include <linux/utsname.h>
-
-#include <asm/irq_regs.h>
-
 #include <linux/sched/affinity.h>
 #include <linux/sched/autogroup.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/coredump.h>
 #include <linux/sched/cpufreq.h>
-#include <linux/sched/cputime.h>
 #include <linux/sched/deadline.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/idle.h>
-#include <linux/sched/init.h>
-#include <linux/sched/isolation.h>
-#include <linux/sched/jobctl.h>
+#include <linux/sched.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/mm.h>
-#include <linux/sched/nohz.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/prio.h>
 #include <linux/sched/rseq_api.h>
-#include <linux/sched/rt.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/smt.h>
 #include <linux/sched/stat.h>
 #include <linux/sched/sysctl.h>
+#include <linux/sched/task_flags.h>
 #include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/sched/thread_info_api.h>
 #include <linux/sched/topology.h>
-#include <linux/sched/user.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/xacct.h>
-
-#include <uapi/linux/sched/types.h>
-
-#include <linux/binfmts.h>
-#include <linux/bitops.h>
-#include <linux/compat.h>
-#include <linux/context_tracking.h>
-#include <linux/cpufreq.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuset.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/delayacct.h>
-#include <linux/energy_model.h>
-#include <linux/init_task.h>
-#include <linux/kprobes.h>
-#include <linux/kthread.h>
-#include <linux/membarrier.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/nmi.h>
-#include <linux/proc_fs.h>
-#include <linux/prefetch.h>
-#include <linux/profile.h>
-#include <linux/psi.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/security.h>
-#include <linux/stop_machine.h>
-#include <linux/suspend.h>
-#include <linux/swait.h>
-#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/tsacct_kern.h>
-#include <linux/hrtimer_api.h>
-
-#ifdef CONFIG_PARAVIRT
-# include <asm/paravirt.h>
-#endif
-
-#include <trace/events/sched.h>
-
-#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-#endif
-
-#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
-#endif
-
-#include "cpupri.h"
-#include "cpudeadline.h"
-
-#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
-#else
-# define SCHED_WARN_ON(x)	({ (void)(x), 0; })
-#endif
 
+#include <linux/atomic.h>
 #include <linux/bitmap.h>
+#include <linux/bug.h>
 #include <linux/capability.h>
+#include <linux/cgroup_api.h>
 #include <linux/cgroup.h>
 #include <linux/cpufreq.h>
 #include <linux/cpumask_api.h>
 #include <linux/ctype.h>
 #include <linux/file.h>
+#include <linux/fs_api.h>
 #include <linux/hrtimer_api.h>
 #include <linux/interrupt.h>
+#include <linux/irq_work.h>
 #include <linux/jiffies.h>
 #include <linux/kref_api.h>
+#include <linux/kthread.h>
 #include <linux/ktime_api.h>
 #include <linux/lockdep_api.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex_api.h>
+#include <linux/plist.h>
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
+#include <linux/profile.h>
 #include <linux/psi.h>
-#include <linux/sched/affinity.h>
-#include <linux/sched.h>
-#include <linux/sched/loadavg.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/rseq_api.h>
-#include <linux/sched/signal.h>
+#include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/softirq.h>
 #include <linux/spinlock_api.h>
+#include <linux/static_key.h>
+#include <linux/stop_machine.h>
 #include <linux/syscalls_api.h>
 #include <linux/syscalls.h>
+#include <linux/tick.h>
 #include <linux/topology.h>
 #include <linux/types.h>
 #include <linux/u64_stats_sync_api.h>
 #include <linux/uaccess.h>
 #include <linux/wait_api.h>
+#include <linux/wait_bit.h>
 #include <linux/workqueue_api.h>
 
 #include <trace/events/power.h>
+#include <trace/events/sched.h>
 
 #include "../workqueue_internal.h"
 
+#ifdef CONFIG_CGROUP_SCHED
+#include <linux/cgroup.h>
+#include <linux/psi.h>
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/static_key.h>
+#endif
+
+#ifdef CONFIG_PARAVIRT
+# include <asm/paravirt.h>
+# include <asm/paravirt_api_clock.h>
+#endif
+
+#include "cpupri.h"
+#include "cpudeadline.h"
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SCHED_WARN_ON(x)      WARN_ONCE(x, #x)
+#else
+# define SCHED_WARN_ON(x)      ({ (void)(x), 0; })
+#endif
+
 struct rq;
 struct cpuidle_state;
 
@@ -1925,6 +1864,8 @@ extern void flush_smp_call_function_from_idle(void);
 static inline void flush_smp_call_function_from_idle(void) { }
 #endif
 
+#include "stats.h"
+
 #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
 
 extern void __sched_core_account_forceidle(struct rq *rq);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index a0df79e6038c..baa839c1ba96 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -2,8 +2,6 @@
 #ifndef _KERNEL_STATS_H
 #define _KERNEL_STATS_H
 
-#include "sched.h"
-
 #ifdef CONFIG_SCHEDSTATS
 
 extern struct static_key_false sched_schedstats;
-- 
cgit v1.2.3


From f2eb478f2f322217aa642e11c1cc011f99c797e6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 22 Feb 2022 08:07:13 +0100
Subject: kernfs: move struct kernfs_root out of the public view.

There is no need to have struct kernfs_root be part of kernfs.h for
the whole kernel to see and poke around it.  Move it internal to kernfs
code and provide a helper function, kernfs_root_to_node(), to handle the
one field that kernfs users were directly accessing from the structure.

Cc: Imran Khan <imran.f.khan@oracle.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220222070713.3517679-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/cgroup/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b31e1465868a..a800c3b1b795 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1302,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
-	struct cgroup *root_cgrp = kf_root->kn->priv;
+	struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
 
 	return root_cgrp->root;
 }
@@ -2025,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 		ret = PTR_ERR(root->kf_root);
 		goto exit_root_id;
 	}
-	root_cgrp->kn = root->kf_root->kn;
+	root_cgrp->kn = kernfs_root_to_node(root->kf_root);
 	WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
 	root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
 
-- 
cgit v1.2.3


From c561d11063009323a0e57c528cb1d77b7d2c41e0 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sun, 20 Feb 2022 10:40:55 -0800
Subject: bpf: Cleanup comments

Add leading space to spdx tag
Use // for spdx c file comment

Replacements
resereved to reserved
inbetween to in between
everytime to every time
intutivie to intuitive
currenct to current
encontered to encountered
referenceing to referencing
upto to up to
exectuted to executed

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20220220184055.3608317-1-trix@redhat.com
---
 kernel/bpf/bpf_local_storage.c | 2 +-
 kernel/bpf/btf.c               | 6 +++---
 kernel/bpf/cgroup.c            | 8 ++++----
 kernel/bpf/hashtab.c           | 2 +-
 kernel/bpf/helpers.c           | 2 +-
 kernel/bpf/local_storage.c     | 2 +-
 kernel/bpf/reuseport_array.c   | 2 +-
 kernel/bpf/syscall.c           | 2 +-
 kernel/bpf/trampoline.c        | 2 +-
 9 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 71de2a89869c..092a1ac772d7 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -136,7 +136,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 		 * will be done by the caller.
 		 *
 		 * Although the unlock will be done under
-		 * rcu_read_lock(),  it is more intutivie to
+		 * rcu_read_lock(),  it is more intuitive to
 		 * read if the freeing of the storage is done
 		 * after the raw_spin_unlock_bh(&local_storage->lock).
 		 *
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2c4c5dbe2abe..0890e56e8b08 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2018 Facebook */
 
 #include <uapi/linux/btf.h>
@@ -2547,7 +2547,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 	 *
 	 * We now need to continue from the last-resolved-ptr to
 	 * ensure the last-resolved-ptr will not referring back to
-	 * the currenct ptr (t).
+	 * the current ptr (t).
 	 */
 	if (btf_type_is_modifier(next_type)) {
 		const struct btf_type *resolved_type;
@@ -6148,7 +6148,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
 
 	btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
 
-	/* If we encontered an error, return it. */
+	/* If we encountered an error, return it. */
 	if (ssnprintf.show.state.status)
 		return ssnprintf.show.state.status;
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 098632fdbc45..128028efda64 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
  * @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
  *
  * If no socket is passed, or the socket is not of type INET or INET6,
  * this function does nothing and returns 0.
@@ -1094,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 /**
  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
  * @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
  *
  * socket is passed is expected to be of type INET or INET6.
  *
@@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  *                                       provided by user sockaddr
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
  * @t_ctx: Pointer to attach type specific context
  * @flags: Pointer to u32 which contains higher bits of BPF program
  *         return value (OR'ed together).
@@ -1166,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
  * sk with connection information (IP addresses, etc.) May not contain
  * cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
  *
  * socket passed is expected to be of type INET or INET6.
  *
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d29af9988f37..65877967f414 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 		value_size = size * num_possible_cpus();
 	total = 0;
 	/* while experimenting with hash tables with sizes ranging from 10 to
-	 * 1000, it was observed that a bucket can have upto 5 entries.
+	 * 1000, it was observed that a bucket can have up to 5 entries.
 	 */
 	bucket_size = 5;
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 49817755b8c3..ae64110a98b5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1093,7 +1093,7 @@ struct bpf_hrtimer {
 struct bpf_timer_kern {
 	struct bpf_hrtimer *timer;
 	/* bpf_spin_lock is used here instead of spinlock_t to make
-	 * sure that it always fits into space resereved by struct bpf_timer
+	 * sure that it always fits into space reserved by struct bpf_timer
 	 * regardless of LOCKDEP and spinlock debug flags.
 	 */
 	struct bpf_spin_lock lock;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 23f7f9d08a62..497916060ac7 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/bpf-cgroup.h>
 #include <linux/bpf.h>
 #include <linux/bpf_local_storage.h>
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 556a769b5b80..8251243022a2 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map)
 
 	/*
 	 * Once reaching here, all sk->sk_user_data is not
-	 * referenceing this "array".  "array" can be freed now.
+	 * referencing this "array". "array" can be freed now.
 	 */
 	bpf_map_area_free(array);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9c7a72b65eee..db402ebc5570 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2566,7 +2566,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
  * pre-allocated resources are to be freed with bpf_cleanup() call. All the
  * transient state is passed around in struct bpf_link_primer.
  * This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
  * itself and attaching it to BPF hook. By using bpf_link_prime() and
  * bpf_link_settle() kernel code using bpf_link doesn't have to perform
  * expensive (and potentially failing) roll back operations in a rare case
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7224691df2ec..0b41fa993825 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -45,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void)
 
 	set_vm_flush_reset_perms(image);
 	/* Keep image as writeable. The alternative is to keep flipping ro/rw
-	 * everytime new program is attached or detached.
+	 * every time new program is attached or detached.
 	 */
 	set_memory_x((long)image, 1);
 	return image;
-- 
cgit v1.2.3


From 9f8e5aee93ed2482638d577a56806b455084b595 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Mon, 14 Feb 2022 12:00:59 -0500
Subject: tracing: Fix allocation of last_cmd in last_cmd_set()

The strncat() used in last_cmd_set() includes the nul byte of length of
the string being copied in, when it should only hold the size of the
string being copied (not the nul byte). Change it to subtract the length
of the allocated space and the nul byte to pass that into the strncat().

Also, assign "len" instead of initializing it to zero and its first update
is to do a "+=".

Link: https://lore.kernel.org/all/202202140628.fj6e4w4v-lkp@intel.com/

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5e8970624bce..78788049f3d3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -744,19 +744,20 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
 {
 	const char *system = NULL, *name = NULL;
 	struct trace_event_call *call;
-	int len = 0;
+	int len;
 
 	if (!str)
 		return;
 
-	len += sizeof(HIST_PREFIX) + strlen(str) + 1;
+	len = sizeof(HIST_PREFIX) + strlen(str) + 1;
 	kfree(last_cmd);
 	last_cmd = kzalloc(len, GFP_KERNEL);
 	if (!last_cmd)
 		return;
 
 	strcpy(last_cmd, HIST_PREFIX);
-	strncat(last_cmd, str, len - sizeof(HIST_PREFIX));
+	len -= sizeof(HIST_PREFIX) + 1;
+	strncat(last_cmd, str, len);
 
 	if (file) {
 		call = file->event_call;
-- 
cgit v1.2.3


From 967747bbc084b93b54e66f9047d342232314cd25 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 11 Feb 2022 21:42:45 +0100
Subject: uaccess: remove CONFIG_SET_FS

There are no remaining callers of set_fs(), so CONFIG_SET_FS
can be removed globally, along with the thread_info field and
any references to it.

This turns access_ok() into a cheaper check against TASK_SIZE_MAX.

As CONFIG_SET_FS is now gone, drop all remaining references to
set_fs()/get_fs(), mm_segment_t, user_addr_max() and uaccess_kernel().

Acked-by: Sam Ravnborg <sam@ravnborg.org> # for sparc32 changes
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Tested-by: Sergey Matyukevich <sergey.matyukevich@synopsys.com> # for arc changes
Acked-by: Stafford Horne <shorne@gmail.com> # [openrisc, asm-generic]
Acked-by: Dinh Nguyen <dinguyen@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 kernel/events/callchain.c |  4 ----
 kernel/events/core.c      |  3 ---
 kernel/exit.c             | 14 --------------
 kernel/kthread.c          |  5 -----
 kernel/stacktrace.c       |  3 ---
 kernel/trace/bpf_trace.c  |  4 ----
 6 files changed, 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 		}
 
 		if (regs) {
-			mm_segment_t fs;
-
 			if (crosstask)
 				goto exit_put;
 
 			if (add_mark)
 				perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
-			fs = force_uaccess_begin();
 			perf_callchain_user(&ctx, regs);
-			force_uaccess_end(fs);
 		}
 	}
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57c7197838db..11ca7303d6df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6746,7 +6746,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 		unsigned long sp;
 		unsigned int rem;
 		u64 dyn_size;
-		mm_segment_t fs;
 
 		/*
 		 * We dump:
@@ -6764,9 +6763,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 
 		/* Data. */
 		sp = perf_user_stack_pointer(regs);
-		fs = force_uaccess_begin();
 		rem = __output_copy_user(handle, (void *) sp, dump_size);
-		force_uaccess_end(fs);
 		dyn_size = dump_size - rem;
 
 		perf_output_skip(handle, rem);
diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..0884a75bc2f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -737,20 +737,6 @@ void __noreturn do_exit(long code)
 
 	WARN_ON(blk_needs_flush_plug(tsk));
 
-	/*
-	 * If do_dead is called because this processes oopsed, it's possible
-	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
-	 * continuing. Amongst other possible reasons, this is to prevent
-	 * mm_release()->clear_child_tid() from writing to a user-controlled
-	 * kernel address.
-	 *
-	 * On uptodate architectures force_uaccess_begin is a noop.  On
-	 * architectures that still have set_fs/get_fs in addition to handling
-	 * oopses handles kernel threads that run as set_fs(KERNEL_DS) by
-	 * default.
-	 */
-	force_uaccess_begin();
-
 	kcov_task_exit(tsk);
 
 	coredump_task_exit(tsk);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 38c6dd822da8..16c2275d4b50 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ struct kthread {
 	int result;
 	int (*threadfn)(void *);
 	void *data;
-	mm_segment_t oldfs;
 	struct completion parked;
 	struct completion exited;
 #ifdef CONFIG_BLK_CGROUP
@@ -1441,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm)
 		mmdrop(active_mm);
 	else
 		smp_mb();
-
-	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
 EXPORT_SYMBOL_GPL(kthread_use_mm);
 
@@ -1457,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(!tsk->mm);
 
-	force_uaccess_end(to_kthread(tsk)->oldfs);
-
 	task_lock(tsk);
 	/*
 	 * When a kthread stops operating on an address space, the loop
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c625257023d..9ed5ce989415 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
 		.store	= store,
 		.size	= size,
 	};
-	mm_segment_t fs;
 
 	/* Trace user stack if not a kernel thread */
 	if (current->flags & PF_KTHREAD)
 		return 0;
 
-	fs = force_uaccess_begin();
 	arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
-	force_uaccess_end(fs);
 
 	return c.len;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..8115fff17018 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -332,8 +332,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
 	if (unlikely(in_interrupt() ||
 		     current->flags & (PF_KTHREAD | PF_EXITING)))
 		return -EPERM;
-	if (unlikely(uaccess_kernel()))
-		return -EPERM;
 	if (unlikely(!nmi_uaccess_okay()))
 		return -EPERM;
 
@@ -835,8 +833,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
 	 */
 	if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
 		return -EPERM;
-	if (unlikely(uaccess_kernel()))
-		return -EPERM;
 	if (unlikely(!nmi_uaccess_okay()))
 		return -EPERM;
 
-- 
cgit v1.2.3


From 4c4559b43c5fbe1af5b55212c3f411781da2093d Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Wed, 2 Feb 2022 11:05:27 +0100
Subject: config: android-recommended: Don't explicitly disable CONFIG_AIO

Android nowadays (for a couple years already) requires AIO for at least
its `adb` "Android Debug Bridge" [1].  Without this config option
(`default y`) it simply refuses start, making users unable to connect to
their phone for debugging purposes when using these kernel fragments.

[1]: https://cs.android.com/android/_/android/platform/packages/modules/adb/+/a2cb8de5e68067a5e1d002886d5f3b42d91371e1

Cc: Amit Pundir <amit.pundir@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Link: https://lore.kernel.org/r/20220202100528.190794-1-marijn.suijten@somainline.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-recommended.config | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index eb0029c9a6a6..22bd76e43aca 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,5 +1,4 @@
 #  KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_LEGACY_PTYS is not set
-- 
cgit v1.2.3


From 3bdd6d5ad5f8f6719cc761127df4af64ff163c79 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Wed, 2 Feb 2022 11:05:28 +0100
Subject: config: android-recommended: Disable BPF_UNPRIV_DEFAULT_OFF for netd

AOSP's `netd` process fails to start on Android S:

    E ClatdController: getClatEgress4MapFd() failure: Operation not permitted
    I netd    : Initializing ClatdController: 410us
    E netd    : Failed to start trafficcontroller: (Status[code: 1, msg: "Pinned map not accessible or does not exist: (/sys/fs/bpf/map_netd_cookie_tag_map): Operation not permitted"])
    E netd    : CRITICAL: sleeping 60 seconds, netd exiting with failure, crash loop likely!

And on Android R:

    I ClatdController: 4.9+ kernel and device shipped with P - clat ebpf might work.
    E ClatdController: getClatEgressMapFd() failure: Operation not permitted
    I netd    : Initializing ClatdController: 1409us
    E netd    : Failed to start trafficcontroller: (Status[code: 1, msg: "Pinned map not accessible or does not exist: (/sys/fs/bpf/map_netd_cookie_tag_map): Operation not permitted"])

These permission issues are caused by 08389d888287 ("bpf: Add kconfig
knob for disabling unpriv bpf by default") because AOSP does not provide
netd the `SYS_ADMIN` capability, and also has no userspace support for
the `BPF` capability yet.

Cc: Amit Pundir <amit.pundir@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: John Stultz <john.stultz@linaro.org>
[John suggested this in https://linaro.atlassian.net/browse/ACK-107?focusedCommentId=117382]
Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Link: https://lore.kernel.org/r/20220202100528.190794-2-marijn.suijten@somainline.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-recommended.config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 22bd76e43aca..e400fbbc8aba 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,4 +1,5 @@
 #  KEEP ALPHABETICALLY SORTED
+# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_LEGACY_PTYS is not set
-- 
cgit v1.2.3


From 80bebebdac935473568c27d4f1349dc8f9809bf7 Mon Sep 17 00:00:00 2001
From: Yucong Sun <fallentree@fb.com>
Date: Fri, 25 Feb 2022 10:59:24 -0800
Subject: bpf: Fix issue with bpf preload module taking over stdout/stdin of
 kernel.

In cb80ddc67152 ("bpf: Convert bpf_preload.ko to use light skeleton.")
BPF preload was switched from user mode process to use in-kernel light
skeleton instead. However, in the kernel context, early in the boot
sequence, the first available FD can start from 0, instead of normally
3 for user mode process. So FDs 0 and 1 are then used for loaded BPF
programs and prevent init process from setting up stdin/stdout/stderr on
FD 0, 1, and 2 as expected.

Before the fix:

ls -lah /proc/1/fd/*

lrwx------1 root root 64 Feb 23 17:20 /proc/1/fd/0 -> /dev/null
lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/1 -> /dev/null
lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/2 -> /dev/console
lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/6 -> /dev/console
lrwx------ 1 root root 64 Feb 23 17:20 /proc/1/fd/7 -> /dev/console

After the fix:

ls -lah /proc/1/fd/*

lrwx------ 1 root root 64 Feb 24 21:23 /proc/1/fd/0 -> /dev/console
lrwx------ 1 root root 64 Feb 24 21:23 /proc/1/fd/1 -> /dev/console
lrwx------ 1 root root 64 Feb 24 21:23 /proc/1/fd/2 -> /dev/console

Fix by closing prog FDs after initialization. struct bpf_prog's
themselves are kept alive through direct kernel references taken with
bpf_link_get_from_fd().

Fixes: cb80ddc67152 ("bpf: Convert bpf_preload.ko to use light skeleton.")
Signed-off-by: Yucong Sun <fallentree@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220225185923.2535519-1-fallentree@fb.com
---
 kernel/bpf/preload/bpf_preload_kern.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 30207c048d36..5106b5372f0c 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -54,6 +54,13 @@ static int load_skel(void)
 		err = PTR_ERR(progs_link);
 		goto out;
 	}
+	/* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+	 * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+	 */
+	close_fd(skel->links.dump_bpf_map_fd);
+	skel->links.dump_bpf_map_fd = 0;
+	close_fd(skel->links.dump_bpf_prog_fd);
+	skel->links.dump_bpf_prog_fd = 0;
 	return 0;
 out:
 	free_links_and_skel();
-- 
cgit v1.2.3


From 5e214f2e43e453d862ebbbd2a4f7ee3fe650f209 Mon Sep 17 00:00:00 2001
From: Connor O'Brien <connoro@google.com>
Date: Wed, 23 Feb 2022 01:28:14 +0000
Subject: bpf: Add config to allow loading modules with BTF mismatches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BTF mismatch can occur for a separately-built module even when the ABI is
otherwise compatible and nothing else would prevent successfully loading.

Add a new Kconfig to control how mismatches are handled. By default, preserve
the current behavior of refusing to load the module. If MODULE_ALLOW_BTF_MISMATCH
is enabled, load the module but ignore its BTF information.

Suggested-by: Yonghong Song <yhs@fb.com>
Suggested-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Connor O'Brien <connoro@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/CAADnVQJ+OVPnBz8z3vNu8gKXX42jCUqfuvhWAyCQDu8N_yqqwQ@mail.gmail.com
Link: https://lore.kernel.org/bpf/20220223012814.1898677-1-connoro@google.com
---
 kernel/bpf/btf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0890e56e8b08..b472cf0c8fdb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6398,7 +6398,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
 			pr_warn("failed to validate module [%s] BTF: %ld\n",
 				mod->name, PTR_ERR(btf));
 			kfree(btf_mod);
-			err = PTR_ERR(btf);
+			if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+				err = PTR_ERR(btf);
 			goto out;
 		}
 		err = btf_alloc_id(btf);
-- 
cgit v1.2.3


From c6ced22997ad56a05377221bded7bb30973a62f2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 25 Jan 2022 10:44:08 -0500
Subject: tracing: Update print fmt check to handle new __get_sockaddr() macro

A helper macro was added to make reading socket addresses easier in trace
events. It pairs %pISpc with __get_sockaddr() that reads the socket
address from the ring buffer into a human readable format.

The boot up check that makes sure that trace events do not reference
pointers to memory that can later be freed when the trace event is read,
incorrectly flagged this as a delayed reference. Update the check to
handle "__get_sockaddr" and not report an error on it.

Link: https://lore.kernel.org/all/20220125160505.068dbb52@canb.auug.org.au/

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 kernel/trace/trace_events.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..f527ae807e77 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -384,6 +384,12 @@ static void test_event_printk(struct trace_event_call *call)
 			if (!(dereference_flags & (1ULL << arg)))
 				goto next_arg;
 
+			/* Check for __get_sockaddr */;
+			if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
+				dereference_flags &= ~(1ULL << arg);
+				goto next_arg;
+			}
+
 			/* Find the REC-> in the argument */
 			c = strchr(fmt + i, ',');
 			r = strstr(fmt + i, "REC->");
-- 
cgit v1.2.3


From f49169c97fceb21ad6a0aaf671c50b0f520f15a5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 16 Feb 2022 12:31:09 -0500
Subject: NFSD: Remove svc_serv_ops::svo_module

struct svc_serv_ops is about to be removed.

Neil Brown says:
> I suspect svo_module can go as well - I don't think the thread is
> ever the thing that primarily keeps a module active.

A random sample of kthread_create() callers shows sunrpc is the only
one that manages module reference count in this way.

Suggested-by: Neil Brown <neilb@suse.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 46a5c2ed1928..6cea788fd965 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -335,7 +335,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
 
 /*
  * A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit.  nfsd and lockd use this.
+ * is running can call this to safely exit.
  */
 void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
 {
-- 
cgit v1.2.3


From ceac059ed4fd8abc0940f799dd5133275011e244 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Wed, 23 Feb 2022 16:05:31 -0800
Subject: bpf: Cache the last valid build_id

For binaries that are statically linked, consecutive stack frames are
likely to be in the same VMA and therefore have the same build id.

On a real-world workload, we observed that 66% of CPU cycles in
__bpf_get_stackid() were spent on build_id_parse() and find_vma().

As an optimization for this case, we can cache the previous frame's
VMA, if the new frame has the same VMA as the previous one, reuse the
previous one's build id.

We are holding the MM locks as reader across the entire loop, so we
don't need to worry about VMA going away.

Tested through "stacktrace_build_id" and "stacktrace_build_id_nmi" in
test_progs.

Suggested-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/bpf/20220224000531.1265030-1-haoluo@google.com
---
 kernel/bpf/stackmap.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 22c8ae94e4c1..38bdfcd06f55 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -132,7 +132,8 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	int i;
 	struct mmap_unlock_irq_work *work = NULL;
 	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma, *prev_vma = NULL;
+	const char *prev_build_id;
 
 	/* If the irq_work is in use, fall back to report ips. Same
 	 * fallback is used for kernel stack (!user) on a stackmap with
@@ -150,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 
 	for (i = 0; i < trace_nr; i++) {
+		if (range_in_vma(prev_vma, ips[i], ips[i])) {
+			vma = prev_vma;
+			memcpy(id_offs[i].build_id, prev_build_id,
+			       BUILD_ID_SIZE_MAX);
+			goto build_id_valid;
+		}
 		vma = find_vma(current->mm, ips[i]);
 		if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
 			/* per entry fall back to ips */
@@ -158,9 +165,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
 			continue;
 		}
+build_id_valid:
 		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+		prev_vma = vma;
+		prev_build_id = id_offs[i].build_id;
 	}
 	bpf_mmap_unlock_mm(work, current->mm);
 }
-- 
cgit v1.2.3


From b664e255ba3c655a675e4e1fe9503d6f7ada3305 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Tue, 22 Feb 2022 17:57:05 +0800
Subject: bpf: Add some description about BPF_JIT_ALWAYS_ON in Kconfig

When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
is permanently set to 1 and setting any other value than that will return
failure.

Add the above description in the help text of config BPF_JIT_ALWAYS_ON, and
then we can distinguish between BPF_JIT_ALWAYS_ON and BPF_JIT_DEFAULT_ON.

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/1645523826-18149-2-git-send-email-yangtiezhu@loongson.cn
---
 kernel/bpf/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index d24d518ddd63..c3cf0b86eeb2 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -58,6 +58,10 @@ config BPF_JIT_ALWAYS_ON
 	  Enables BPF JIT and removes BPF interpreter to avoid speculative
 	  execution of BPF instructions by the interpreter.
 
+	  When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+	  is permanently set to 1 and setting any other value than that will
+	  return failure.
+
 config BPF_JIT_DEFAULT_ON
 	def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
 	depends on HAVE_EBPF_JIT && BPF_JIT
-- 
cgit v1.2.3


From 248cc9993d1cc12b8e9ed716cc3fc09f6c3517dd Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sun, 20 Feb 2022 13:14:24 +0800
Subject: sched/cpuacct: Fix charge percpu cpuusage

The cpuacct_account_field() is always called by the current task
itself, so it's ok to use __this_cpu_add() to charge the tick time.

But cpuacct_charge() maybe called by update_curr() in load_balance()
on a random CPU, different from the CPU on which the task is running.
So __this_cpu_add() will charge that cputime to a random incorrect CPU.

Fixes: 73e6aafd9ea8 ("sched/cpuacct: Simplify the cpuacct code")
Reported-by: Minye Zhu <zhuminye@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220220051426.5274-1-zhouchengming@bytedance.com
---
 kernel/sched/cpuacct.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 3d06c5e4220d..307800586ac8 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -334,12 +334,13 @@ static struct cftype files[] = {
  */
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
+	unsigned int cpu = task_cpu(tsk);
 	struct cpuacct *ca;
 
 	rcu_read_lock();
 
 	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-		__this_cpu_add(*ca->cpuusage, cputime);
+		*per_cpu_ptr(ca->cpuusage, cpu) += cputime;
 
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From dc6e0818bc9a0336d9accf3ea35d146d72aa7a18 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sun, 20 Feb 2022 13:14:25 +0800
Subject: sched/cpuacct: Optimize away RCU read lock

Since cpuacct_charge() is called from the scheduler update_curr(),
we must already have rq lock held, then the RCU read lock can
be optimized away.

And do the same thing in it's wrapper cgroup_account_cputime(),
but we can't use lockdep_assert_rq_held() there, which defined
in kernel/sched/sched.h.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220220051426.5274-2-zhouchengming@bytedance.com
---
 kernel/sched/cpuacct.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 307800586ac8..f79f88456d72 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -337,12 +337,10 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	unsigned int cpu = task_cpu(tsk);
 	struct cpuacct *ca;
 
-	rcu_read_lock();
+	lockdep_assert_rq_held(cpu_rq(cpu));
 
 	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
 		*per_cpu_ptr(ca->cpuusage, cpu) += cputime;
-
-	rcu_read_unlock();
 }
 
 /*
-- 
cgit v1.2.3


From 3eba0505d03a9c1eb30d40c2330c0880b22d1b3a Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sun, 20 Feb 2022 13:14:26 +0800
Subject: sched/cpuacct: Remove redundant RCU read lock

The cpuacct_account_field() and it's cgroup v2 wrapper
cgroup_account_cputime_field() is only called from cputime
in task_group_account_field(), which is already in RCU read-side
critical section. So remove these redundant RCU read lock.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220220051426.5274-3-zhouchengming@bytedance.com
---
 kernel/sched/cpuacct.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f79f88456d72..d269ede84e85 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -352,10 +352,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 	struct cpuacct *ca;
 
-	rcu_read_lock();
 	for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
 		__this_cpu_add(ca->cpustat->cpustat[index], val);
-	rcu_read_unlock();
 }
 
 struct cgroup_subsys cpuacct_cgrp_subsys = {
-- 
cgit v1.2.3


From 49bef33e4b87b743495627a529029156c6e09530 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Thu, 27 Jan 2022 15:40:59 +0000
Subject: sched/rt: Plug rt_mutex_setprio() vs push_rt_task() race

John reported that push_rt_task() can end up invoking
find_lowest_rq(rq->curr) when curr is not an RT task (in this case a CFS
one), which causes mayhem down convert_prio().

This can happen when current gets demoted to e.g. CFS when releasing an
rt_mutex, and the local CPU gets hit with an rto_push_work irqwork before
getting the chance to reschedule. Exactly who triggers this work isn't
entirely clear to me - switched_from_rt() only invokes rt_queue_pull_task()
if there are no RT tasks on the local RQ, which means the local CPU can't
be in the rto_mask.

My current suspected sequence is something along the lines of the below,
with the demoted task being current.

  mark_wakeup_next_waiter()
    rt_mutex_adjust_prio()
      rt_mutex_setprio() // deboost originally-CFS task
	check_class_changed()
	  switched_from_rt() // Only rt_queue_pull_task() if !rq->rt.rt_nr_running
	  switched_to_fair() // Sets need_resched
      __balance_callbacks() // if pull_rt_task(), tell_cpu_to_push() can't select local CPU per the above
      raw_spin_rq_unlock(rq)

       // need_resched is set, so task_woken_rt() can't
       // invoke push_rt_tasks(). Best I can come up with is
       // local CPU has rt_nr_migratory >= 2 after the demotion, so stays
       // in the rto_mask, and then:

       <some other CPU running rto_push_irq_work_func() queues rto_push_work on this CPU>
	 push_rt_task()
	   // breakage follows here as rq->curr is CFS

Move an existing check to check rq->curr vs the next pushable task's
priority before getting anywhere near find_lowest_rq(). While at it, add an
explicit sched_class of rq->curr check prior to invoking
find_lowest_rq(rq->curr). Align the DL logic to also reschedule regardless
of next_task's migratability.

Fixes: a7c81556ec4d ("sched: Fix migrate_disable() vs rt/dl balancing")
Reported-by: John Keeping <john@metanate.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: John Keeping <john@metanate.com>
Link: https://lore.kernel.org/r/20220127154059.974729-1-valentin.schneider@arm.com
---
 kernel/sched/deadline.c | 12 ++++++------
 kernel/sched/rt.c       | 32 ++++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d2c072b0ef01..62f0cf842277 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2240,12 +2240,6 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
-	if (is_migration_disabled(next_task))
-		return 0;
-
-	if (WARN_ON(next_task == rq->curr))
-		return 0;
-
 	/*
 	 * If next_task preempts rq->curr, and rq->curr
 	 * can move away, it makes sense to just reschedule
@@ -2258,6 +2252,12 @@ retry:
 		return 0;
 	}
 
+	if (is_migration_disabled(next_task))
+		return 0;
+
+	if (WARN_ON(next_task == rq->curr))
+		return 0;
+
 	/* We might release rq lock */
 	get_task_struct(next_task);
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7b4f4fbbb404..14f273c29518 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2026,6 +2026,16 @@ static int push_rt_task(struct rq *rq, bool pull)
 		return 0;
 
 retry:
+	/*
+	 * It's possible that the next_task slipped in of
+	 * higher priority than current. If that's the case
+	 * just reschedule current.
+	 */
+	if (unlikely(next_task->prio < rq->curr->prio)) {
+		resched_curr(rq);
+		return 0;
+	}
+
 	if (is_migration_disabled(next_task)) {
 		struct task_struct *push_task = NULL;
 		int cpu;
@@ -2033,6 +2043,18 @@ retry:
 		if (!pull || rq->push_busy)
 			return 0;
 
+		/*
+		 * Invoking find_lowest_rq() on anything but an RT task doesn't
+		 * make sense. Per the above priority check, curr has to
+		 * be of higher priority than next_task, so no need to
+		 * reschedule when bailing out.
+		 *
+		 * Note that the stoppers are masqueraded as SCHED_FIFO
+		 * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+		 */
+		if (rq->curr->sched_class != &rt_sched_class)
+			return 0;
+
 		cpu = find_lowest_rq(rq->curr);
 		if (cpu == -1 || cpu == rq->cpu)
 			return 0;
@@ -2057,16 +2079,6 @@ retry:
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
-	/*
-	 * It's possible that the next_task slipped in of
-	 * higher priority than current. If that's the case
-	 * just reschedule current.
-	 */
-	if (unlikely(next_task->prio < rq->curr->prio)) {
-		resched_curr(rq);
-		return 0;
-	}
-
 	/* We might release rq lock */
 	get_task_struct(next_task);
 
-- 
cgit v1.2.3


From fa2c3254d7cfff5f7a916ab928a562d1165f17bb Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Thu, 20 Jan 2022 16:25:19 +0000
Subject: sched/tracing: Don't re-read p->state when emitting sched_switch
 event

As of commit

  c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

the following sequence becomes possible:

		      p->__state = TASK_INTERRUPTIBLE;
		      __schedule()
			deactivate_task(p);
  ttwu()
    READ !p->on_rq
    p->__state=TASK_WAKING
			trace_sched_switch()
			  __trace_sched_switch_state()
			    task_state_index()
			      return 0;

TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.

Prevent this by pushing the value read from __schedule() down the trace
event.

Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com
---
 kernel/sched/core.c               | 4 ++--
 kernel/trace/fgraph.c             | 4 +++-
 kernel/trace/ftrace.c             | 4 +++-
 kernel/trace/trace_events.c       | 8 ++++++--
 kernel/trace/trace_osnoise.c      | 4 +++-
 kernel/trace/trace_sched_switch.c | 1 +
 kernel/trace/trace_sched_wakeup.c | 1 +
 7 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef946123e9af..3aafc15da24a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 {
 	struct rq *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
-	long prev_state;
+	unsigned int prev_state;
 
 	/*
 	 * The previous task will have left us with a preempt_count of 2
@@ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..19028e072cdb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ free:
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-			struct task_struct *prev, struct task_struct *next)
+				unsigned int prev_state,
+				struct task_struct *prev,
+				struct task_struct *next)
 {
 	unsigned long long timestamp;
 	int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9feb197b2da..6762ae029fdd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-		    struct task_struct *prev, struct task_struct *next)
+				     unsigned int prev_state,
+				     struct task_struct *prev,
+				     struct task_struct *next)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..2a19ea747ff4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-		    struct task_struct *prev, struct task_struct *next)
+					unsigned int prev_state,
+					struct task_struct *prev,
+					struct task_struct *next)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-		    struct task_struct *prev, struct task_struct *next)
+					 unsigned int prev_state,
+					 struct task_struct *prev,
+					 struct task_struct *next)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 870a08da5b48..1829b4cb8cc1 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  * used to record the beginning and to report the end of a thread noise window.
  */
 static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+			    unsigned int prev_state,
+			    struct task_struct *p,
 			    struct task_struct *n)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..993b0ed10d8c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
+		   unsigned int prev_state,
 		   struct task_struct *prev, struct task_struct *next)
 {
 	int flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..46429f9a96fa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
+			  unsigned int prev_state,
 			  struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array_cpu *data;
-- 
cgit v1.2.3


From 444e1154b2bf0b881b65ba1bba5bc8e691fac04a Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Tue, 15 Feb 2022 11:37:50 +0800
Subject: PM: hibernate: Clean up non-kernel-doc comments

Address the following W=1 kernel build warning:

kernel/power/swap.c:120: warning: This comment starts with '/**', but
isn't a kernel-doc comment. Refer
Documentation/doc-guide/kernel-doc.rst.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ad10359030a4..c51f5507b34f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -89,7 +89,7 @@ struct swap_map_page_list {
 	struct swap_map_page_list *next;
 };
 
-/**
+/*
  *	The swap_map_handle structure is used for handling swap in
  *	a file-alike way
  */
@@ -117,7 +117,7 @@ struct swsusp_header {
 
 static struct swsusp_header *swsusp_header;
 
-/**
+/*
  *	The following functions are used for tracing the allocated
  *	swap pages, so that they can be freed in case of an error.
  */
@@ -171,7 +171,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
 	return 0;
 }
 
-/**
+/*
  *	alloc_swapdev_block - allocate a swap page and register that it has
  *	been allocated, so that it can be freed in case of an error.
  */
@@ -190,7 +190,7 @@ sector_t alloc_swapdev_block(int swap)
 	return 0;
 }
 
-/**
+/*
  *	free_all_swap_pages - free swap pages allocated for saving image data.
  *	It also frees the extents used to register which swap entries had been
  *	allocated.
-- 
cgit v1.2.3


From ba7ffcd4c4da374b0f64666354eeeda7d3827131 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 28 Feb 2022 14:05:32 -0800
Subject: PM: hibernate: fix __setup handler error handling

If an invalid value is used in "resumedelay=<seconds>", it is
silently ignored. Add a warning message and then let the __setup
handler return 1 to indicate that the kernel command line option
has been handled.

Fixes: 317cf7e5e85e3 ("PM / hibernate: convert simple_strtoul to kstrtoul")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 49d1df0218cb..0ac805b753e5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1330,7 +1330,7 @@ static int __init resumedelay_setup(char *str)
 	int rc = kstrtouint(str, 0, &resume_delay);
 
 	if (rc)
-		return rc;
+		pr_warn("resumedelay: bad option string '%s'\n", str);
 	return 1;
 }
 
-- 
cgit v1.2.3


From 7a64ca17e4dd50d5f910769167f3553902777844 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 28 Feb 2022 14:05:44 -0800
Subject: PM: suspend: fix return value of __setup handler

If an invalid option is given for "test_suspend=<option>", the entire
string is added to init's environment, so return 1 instead of 0 from
the __setup handler.

  Unknown kernel command line parameters "BOOT_IMAGE=/boot/bzImage-517rc5
    test_suspend=invalid"

and

 Run /sbin/init as init process
   with arguments:
     /sbin/init
   with environment:
     HOME=/
     TERM=linux
     BOOT_IMAGE=/boot/bzImage-517rc5
     test_suspend=invalid

Fixes: 2ce986892faf ("PM / sleep: Enhance test_suspend option with repeat capability")
Fixes: 27ddcc6596e5 ("PM / sleep: Add state field to pm_states[] entries")
Fixes: a9d7052363a6 ("PM: Separate suspend to RAM functionality from core")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/suspend_test.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index d20526c5be15..b663a97f5867 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
 	value++;
 	suspend_type = strsep(&value, ",");
 	if (!suspend_type)
-		return 0;
+		return 1;
 
 	repeat = strsep(&value, ",");
 	if (repeat) {
 		if (kstrtou32(repeat, 0, &test_repeat_count_max))
-			return 0;
+			return 1;
 	}
 
 	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
 		if (!strcmp(pm_labels[i], suspend_type)) {
 			test_state_label = pm_labels[i];
-			return 0;
+			return 1;
 		}
 
 	printk(warn_bad_state, suspend_type);
-	return 0;
+	return 1;
 }
 __setup("test_suspend", setup_test_suspend);
 
-- 
cgit v1.2.3


From b665eae7a788c5e2bc10f9ac3c0137aa0ad1fc97 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 28 Feb 2022 14:05:56 -0800
Subject: printk: fix return value of printk.devkmsg __setup handler

If an invalid option value is used with "printk.devkmsg=<value>",
it is silently ignored.
If a valid option value is used, it is honored but the wrong return
value (0) is used, indicating that the command line option had an
error and was not handled. This string is not added to init's
environment strings due to init/main.c::unknown_bootoption()
checking for a '.' in the boot option string and then considering
that string to be an "Unused module parameter".

Print a warning message if a bad option string is used.
Always return 1 from the __setup handler to indicate that the command
line option has been handled.

Fixes: 750afe7babd1 ("printk: add kernel parameter to control writes to /dev/kmsg")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Cc: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: John Ogness <john.ogness@linutronix.de>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220228220556.23484-1-rdunlap@infradead.org
---
 kernel/printk/printk.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 266cc974b0e3..21fb4e1cd7db 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -146,8 +146,10 @@ static int __control_devkmsg(char *str)
 
 static int __init control_devkmsg(char *str)
 {
-	if (__control_devkmsg(str) < 0)
+	if (__control_devkmsg(str) < 0) {
+		pr_warn("printk.devkmsg: bad option string '%s'\n", str);
 		return 1;
+	}
 
 	/*
 	 * Set sysctl string accordingly:
@@ -166,7 +168,7 @@ static int __init control_devkmsg(char *str)
 	 */
 	devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
 
-	return 0;
+	return 1;
 }
 __setup("printk.devkmsg=", control_devkmsg);
 
-- 
cgit v1.2.3


From 676b2daabaf9a993db0e02a5ce79b984aaa0388b Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 2 Mar 2022 09:51:26 -0800
Subject: bpf, x86: Set header->size properly before freeing it

On do_jit failure path, the header is freed by bpf_jit_binary_pack_free.
While bpf_jit_binary_pack_free doesn't require proper ro_header->size,
bpf_prog_pack_free still uses it. Set header->size in bpf_int_jit_compile
before calling bpf_jit_binary_pack_free.

Fixes: 1022a5498f6f ("bpf, x86_64: Use bpf_jit_binary_pack_alloc")
Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]")
Reported-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220302175126.247459-3-song@kernel.org
---
 kernel/bpf/core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ebb0193d07f0..ab630f773ec1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1112,13 +1112,16 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
  *   1) when the program is freed after;
  *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
  * For case 2), we need to free both the RO memory and the RW buffer.
- * Also, ro_header->size in 2) is not properly set yet, so rw_header->size
- * is used for uncharge.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
  */
 void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
 			      struct bpf_binary_header *rw_header)
 {
-	u32 size = rw_header ? rw_header->size : ro_header->size;
+	u32 size = ro_header->size;
 
 	bpf_prog_pack_free(ro_header);
 	kvfree(rw_header);
-- 
cgit v1.2.3


From f5ff79fddf0efecca538046b5cc20fb3ded2ec4f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 26 Feb 2022 16:40:21 +0100
Subject: dma-mapping: remove CONFIG_DMA_REMAP

CONFIG_DMA_REMAP is used to build a few helpers around the core
vmalloc code, and to use them in case there is a highmem page in
dma-direct, and to make dma coherent allocations be able to use
non-contiguous pages allocations for DMA allocations in the dma-iommu
layer.

Right now it needs to be explicitly selected by architectures, and
is only done so by architectures that require remapping to deal
with devices that are not DMA coherent.  Make it unconditional for
builds with CONFIG_MMU as it is very little extra code, but makes
it much more likely that large DMA allocations succeed on x86.

This fixes hot plugging a NVMe thunderbolt SSD for me, which tries
to allocate a 1MB buffer that is otherwise hard to obtain due to
memory fragmentation on a heavily used laptop.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 kernel/dma/Kconfig  |  7 +------
 kernel/dma/Makefile |  2 +-
 kernel/dma/direct.c | 18 +++++++-----------
 3 files changed, 9 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1b02179758cb..56866aaa2ae1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL
 	select DMA_DECLARE_COHERENT
 	bool
 
-config DMA_REMAP
-	bool
-	depends on MMU
-	select DMA_NONCOHERENT_MMAP
-
 config DMA_DIRECT_REMAP
 	bool
-	select DMA_REMAP
 	select DMA_COHERENT_POOL
+	select DMA_NONCOHERENT_MMAP
 
 config DMA_CMA
 	bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT)	+= coherent.o
 obj-$(CONFIG_DMA_API_DEBUG)		+= debug.o
 obj-$(CONFIG_SWIOTLB)			+= swiotlb.o
 obj-$(CONFIG_DMA_COHERENT_POOL)		+= pool.o
-obj-$(CONFIG_DMA_REMAP)			+= remap.o
+obj-$(CONFIG_MMU)			+= remap.o
 obj-$(CONFIG_DMA_MAP_BENCHMARK)		+= map_benchmark.o
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50f48e9e4598..35a1d29d6a2e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -265,17 +265,13 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
 	if (!page)
 		return NULL;
+
+	/*
+	 * dma_alloc_contiguous can return highmem pages depending on a
+	 * combination the cma= arguments and per-arch setup.  These need to be
+	 * remapped to return a kernel virtual address.
+	 */
 	if (PageHighMem(page)) {
-		/*
-		 * Depending on the cma= arguments and per-arch setup,
-		 * dma_alloc_contiguous could return highmem pages.
-		 * Without remapping there is no way to return them here, so
-		 * log an error and fail.
-		 */
-		if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
-			dev_info(dev, "Rejecting highmem page from CMA.\n");
-			goto out_free_pages;
-		}
 		remap = true;
 		set_uncached = false;
 	}
@@ -349,7 +345,7 @@ void dma_direct_free(struct device *dev, size_t size,
 	    dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
 		return;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+	if (is_vmalloc_addr(cpu_addr)) {
 		vunmap(cpu_addr);
 	} else {
 		if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
-- 
cgit v1.2.3


From 80e4390981618e290616dbd06ea190d4576f219d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 28 Feb 2022 14:04:53 -0800
Subject: dma-debug: fix return value of __setup handlers

When valid kernel command line parameters
  dma_debug=off dma_debug_entries=100
are used, they are reported as Unknown parameters and added to init's
environment strings, polluting it.

  Unknown kernel command line parameters "BOOT_IMAGE=/boot/bzImage-517rc5
    dma_debug=off dma_debug_entries=100", will be passed to user space.

and

 Run /sbin/init as init process
   with arguments:
     /sbin/init
   with environment:
     HOME=/
     TERM=linux
     BOOT_IMAGE=/boot/bzImage-517rc5
     dma_debug=off
     dma_debug_entries=100

Return 1 from these __setup handlers to indicate that the command line
option has been handled.

Fixes: 59d3daafa1726 ("dma-debug: add kernel command line parameters")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: iommu@lists.linux-foundation.org
Cc: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7a14ca29c377..f8ff598596b8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str)
 		global_disable = true;
 	}
 
-	return 0;
+	return 1;
 }
 
 static __init int dma_debug_entries_cmdline(char *str)
@@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str)
 		return -EINVAL;
 	if (!get_option(&str, &nr_prealloc_entries))
 		nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
-	return 0;
+	return 1;
 }
 
 __setup("dma_debug=", dma_debug_cmdline);
-- 
cgit v1.2.3


From 7df5072cc05fd1aab5823bbc465d033cd292fca8 Mon Sep 17 00:00:00 2001
From: Mykola Lysenko <mykolal@fb.com>
Date: Tue, 1 Mar 2022 14:27:45 -0800
Subject: bpf: Small BPF verifier log improvements

In particular these include:

  1) Remove output of inv for scalars in print_verifier_state
  2) Replace inv with scalar in verifier error messages
  3) Remove _value suffixes for umin/umax/s32_min/etc (except map_value)
  4) Remove output of id=0
  5) Remove output of ref_obj_id=0

Signed-off-by: Mykola Lysenko <mykolal@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220301222745.1667206-1-mykolal@fb.com
---
 kernel/bpf/verifier.c | 64 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d7473fee247c..a57db4b2803c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -539,7 +539,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 	char postfix[16] = {0}, prefix[32] = {0};
 	static const char * const str[] = {
 		[NOT_INIT]		= "?",
-		[SCALAR_VALUE]		= "inv",
+		[SCALAR_VALUE]		= "scalar",
 		[PTR_TO_CTX]		= "ctx",
 		[CONST_PTR_TO_MAP]	= "map_ptr",
 		[PTR_TO_MAP_VALUE]	= "map_value",
@@ -685,74 +685,80 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			continue;
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
-		verbose(env, "=%s", reg_type_str(env, t));
+		verbose(env, "=");
 		if (t == SCALAR_VALUE && reg->precise)
 			verbose(env, "P");
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
+			verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
 			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
+			const char *sep = "";
+
+			verbose(env, "%s", reg_type_str(env, t));
 			if (base_type(t) == PTR_TO_BTF_ID ||
 			    base_type(t) == PTR_TO_PERCPU_BTF_ID)
 				verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
-			verbose(env, "(id=%d", reg->id);
-			if (reg_type_may_be_refcounted_or_null(t))
-				verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
+			verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+			if (reg->id)
+				verbose_a("id=%d", reg->id);
+			if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
 			if (t != SCALAR_VALUE)
-				verbose(env, ",off=%d", reg->off);
+				verbose_a("off=%d", reg->off);
 			if (type_is_pkt_pointer(t))
-				verbose(env, ",r=%d", reg->range);
+				verbose_a("r=%d", reg->range);
 			else if (base_type(t) == CONST_PTR_TO_MAP ||
 				 base_type(t) == PTR_TO_MAP_KEY ||
 				 base_type(t) == PTR_TO_MAP_VALUE)
-				verbose(env, ",ks=%d,vs=%d",
-					reg->map_ptr->key_size,
-					reg->map_ptr->value_size);
+				verbose_a("ks=%d,vs=%d",
+					  reg->map_ptr->key_size,
+					  reg->map_ptr->value_size);
 			if (tnum_is_const(reg->var_off)) {
 				/* Typically an immediate SCALAR_VALUE, but
 				 * could be a pointer whose offset is too big
 				 * for reg->off
 				 */
-				verbose(env, ",imm=%llx", reg->var_off.value);
+				verbose_a("imm=%llx", reg->var_off.value);
 			} else {
 				if (reg->smin_value != reg->umin_value &&
 				    reg->smin_value != S64_MIN)
-					verbose(env, ",smin_value=%lld",
-						(long long)reg->smin_value);
+					verbose_a("smin=%lld", (long long)reg->smin_value);
 				if (reg->smax_value != reg->umax_value &&
 				    reg->smax_value != S64_MAX)
-					verbose(env, ",smax_value=%lld",
-						(long long)reg->smax_value);
+					verbose_a("smax=%lld", (long long)reg->smax_value);
 				if (reg->umin_value != 0)
-					verbose(env, ",umin_value=%llu",
-						(unsigned long long)reg->umin_value);
+					verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
 				if (reg->umax_value != U64_MAX)
-					verbose(env, ",umax_value=%llu",
-						(unsigned long long)reg->umax_value);
+					verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
 				if (!tnum_is_unknown(reg->var_off)) {
 					char tn_buf[48];
 
 					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose(env, ",var_off=%s", tn_buf);
+					verbose_a("var_off=%s", tn_buf);
 				}
 				if (reg->s32_min_value != reg->smin_value &&
 				    reg->s32_min_value != S32_MIN)
-					verbose(env, ",s32_min_value=%d",
-						(int)(reg->s32_min_value));
+					verbose_a("s32_min=%d", (int)(reg->s32_min_value));
 				if (reg->s32_max_value != reg->smax_value &&
 				    reg->s32_max_value != S32_MAX)
-					verbose(env, ",s32_max_value=%d",
-						(int)(reg->s32_max_value));
+					verbose_a("s32_max=%d", (int)(reg->s32_max_value));
 				if (reg->u32_min_value != reg->umin_value &&
 				    reg->u32_min_value != U32_MIN)
-					verbose(env, ",u32_min_value=%d",
-						(int)(reg->u32_min_value));
+					verbose_a("u32_min=%d", (int)(reg->u32_min_value));
 				if (reg->u32_max_value != reg->umax_value &&
 				    reg->u32_max_value != U32_MAX)
-					verbose(env, ",u32_max_value=%d",
-						(int)(reg->u32_max_value));
+					verbose_a("u32_max=%d", (int)(reg->u32_max_value));
 			}
+#undef verbose_a
+
 			verbose(env, ")");
 		}
 	}
@@ -777,7 +783,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		if (is_spilled_reg(&state->stack[i])) {
 			reg = &state->stack[i].spilled_ptr;
 			t = reg->type;
-			verbose(env, "=%s", reg_type_str(env, t));
+			verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
 			if (t == SCALAR_VALUE && reg->precise)
 				verbose(env, "P");
 			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
-- 
cgit v1.2.3


From bf9ad37dc8a30cce22ae95d6c2ca6abf8731d305 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 14 Jul 2015 14:26:34 +0200
Subject: signal, x86: Delay calling signals in atomic on RT enabled kernels

On x86_64 we must disable preemption before we enable interrupts
for stack faults, int3 and debugging, because the current task is using
a per CPU debug stack defined by the IST. If we schedule out, another task
can come in and use the same stack and cause the stack to be corrupted
and crash the kernel on return.

When CONFIG_PREEMPT_RT is enabled, spinlock_t locks become sleeping, and
one of these is the spin lock used in signal handling.

Some of the debug code (int3) causes do_trap() to send a signal.
This function calls a spinlock_t lock that has been converted to a
sleeping lock. If this happens, the above issues with the corrupted
stack is possible.

Instead of calling the signal right away, for PREEMPT_RT and x86,
the signal information is stored on the stacks task_struct and
TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume
code will send the signal when preemption is enabled.

[ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT to
  ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ]
[bigeasy: Add on 32bit as per Yang Shi, minor rewording. ]
[ tglx: Use a config option ]

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/Ygq5aBB/qMQw6aP5@linutronix.de
---
 kernel/Kconfig.preempt | 12 +++++++++++-
 kernel/entry/common.c  | 14 ++++++++++++++
 kernel/signal.c        | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index ce77f0265660..5644abd5f8a8 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -132,4 +132,14 @@ config SCHED_CORE
 	  which is the likely usage by Linux distributions, there should
 	  be no measurable impact on performance.
 
-
+config ARCH_WANTS_RT_DELAYED_SIGNALS
+	bool
+	help
+	  This option is selected by architectures where raising signals
+	  can happen in atomic contexts on PREEMPT_RT enabled kernels. This
+	  option delays raising the signal until the return to user space
+	  loop where it is also delivered. X86 requires this to deliver
+	  signals from trap handlers which run on IST stacks.
+
+config RT_DELAYED_SIGNALS
+	def_bool PREEMPT_RT && ARCH_WANTS_RT_DELAYED_SIGNALS
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..0543a2c92f20 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -148,6 +148,18 @@ static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
 	arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
 }
 
+#ifdef CONFIG_RT_DELAYED_SIGNALS
+static inline void raise_delayed_signal(void)
+{
+	if (unlikely(current->forced_info.si_signo)) {
+		force_sig_info(&current->forced_info);
+		current->forced_info.si_signo = 0;
+	}
+}
+#else
+static inline void raise_delayed_signal(void) { }
+#endif
+
 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 					    unsigned long ti_work)
 {
@@ -162,6 +174,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & _TIF_NEED_RESCHED)
 			schedule();
 
+		raise_delayed_signal();
+
 		if (ti_work & _TIF_UPROBE)
 			uprobe_notify_resume(regs);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b04631acde8..e93de6daa188 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1307,6 +1307,43 @@ enum sig_handler {
 	HANDLER_EXIT,	 /* Only visible as the process exit code */
 };
 
+/*
+ * On some archictectures, PREEMPT_RT has to delay sending a signal from a
+ * trap since it cannot enable preemption, and the signal code's
+ * spin_locks turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME
+ * which will send the signal on exit of the trap.
+ */
+#ifdef CONFIG_RT_DELAYED_SIGNALS
+static inline bool force_sig_delayed(struct kernel_siginfo *info,
+				     struct task_struct *t)
+{
+	if (!in_atomic())
+		return false;
+
+	if (WARN_ON_ONCE(t->forced_info.si_signo))
+		return true;
+
+	if (is_si_special(info)) {
+		WARN_ON_ONCE(info != SEND_SIG_PRIV);
+		t->forced_info.si_signo = info->si_signo;
+		t->forced_info.si_errno = 0;
+		t->forced_info.si_code = SI_KERNEL;
+		t->forced_info.si_pid = 0;
+		t->forced_info.si_uid = 0;
+	} else {
+		t->forced_info = *info;
+	}
+	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+	return true;
+}
+#else
+static inline bool force_sig_delayed(struct kernel_siginfo *info,
+				     struct task_struct *t)
+{
+	return false;
+}
+#endif
+
 /*
  * Force a signal that the process can't ignore: if necessary
  * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1327,6 +1364,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 	struct k_sigaction *action;
 	int sig = info->si_signo;
 
+	if (force_sig_delayed(info, t))
+		return 0;
+
 	spin_lock_irqsave(&t->sighand->siglock, flags);
 	action = &t->sighand->action[sig-1];
 	ignored = action->sa.sa_handler == SIG_IGN;
-- 
cgit v1.2.3


From 25b35dd28138f61f9a0fb8b76c0483761fd228bd Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 5 Mar 2022 04:16:38 +0530
Subject: bpf: Add check_func_arg_reg_off function

Lift the list of register types allowed for having fixed and variable
offsets when passed as helper function arguments into a common helper,
so that they can be reused for kfunc checks in later commits. Keeping a
common helper aids maintainability and allows us to follow the same
consistent rules across helpers and kfuncs. Also, convert check_func_arg
to use this function.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220304224645.3677453-2-memxor@gmail.com
---
 kernel/bpf/verifier.c | 69 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a57db4b2803c..e37eb6020253 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5359,6 +5359,44 @@ found:
 	return 0;
 }
 
+int check_func_arg_reg_off(struct bpf_verifier_env *env,
+			   const struct bpf_reg_state *reg, int regno,
+			   enum bpf_arg_type arg_type)
+{
+	enum bpf_reg_type type = reg->type;
+	bool fixed_off_ok = false;
+
+	switch ((u32)type) {
+	case SCALAR_VALUE:
+	/* Pointer types where reg offset is explicitly allowed: */
+	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
+	case PTR_TO_MAP_KEY:
+	case PTR_TO_MAP_VALUE:
+	case PTR_TO_MEM:
+	case PTR_TO_MEM | MEM_RDONLY:
+	case PTR_TO_MEM | MEM_ALLOC:
+	case PTR_TO_BUF:
+	case PTR_TO_BUF | MEM_RDONLY:
+	case PTR_TO_STACK:
+		/* Some of the argument types nevertheless require a
+		 * zero register offset.
+		 */
+		if (arg_type != ARG_PTR_TO_ALLOC_MEM)
+			return 0;
+		break;
+	/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+	 * fixed offset.
+	 */
+	case PTR_TO_BTF_ID:
+		fixed_off_ok = true;
+		break;
+	default:
+		break;
+	}
+	return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
 			  const struct bpf_func_proto *fn)
@@ -5408,34 +5446,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	if (err)
 		return err;
 
-	switch ((u32)type) {
-	case SCALAR_VALUE:
-	/* Pointer types where reg offset is explicitly allowed: */
-	case PTR_TO_PACKET:
-	case PTR_TO_PACKET_META:
-	case PTR_TO_MAP_KEY:
-	case PTR_TO_MAP_VALUE:
-	case PTR_TO_MEM:
-	case PTR_TO_MEM | MEM_RDONLY:
-	case PTR_TO_MEM | MEM_ALLOC:
-	case PTR_TO_BUF:
-	case PTR_TO_BUF | MEM_RDONLY:
-	case PTR_TO_STACK:
-		/* Some of the argument types nevertheless require a
-		 * zero register offset.
-		 */
-		if (arg_type == ARG_PTR_TO_ALLOC_MEM)
-			goto force_off_check;
-		break;
-	/* All the rest must be rejected: */
-	default:
-force_off_check:
-		err = __check_ptr_off_reg(env, reg, regno,
-					  type == PTR_TO_BTF_ID);
-		if (err < 0)
-			return err;
-		break;
-	}
+	err = check_func_arg_reg_off(env, reg, regno, arg_type);
+	if (err)
+		return err;
 
 skip_type_check:
 	if (reg->ref_obj_id) {
-- 
cgit v1.2.3


From 655efe5089f077485eec848272bd7e26b1a5a735 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 5 Mar 2022 04:16:39 +0530
Subject: bpf: Fix PTR_TO_BTF_ID var_off check

When kfunc support was added, check_ctx_reg was called for PTR_TO_CTX
register, but no offset checks were made for PTR_TO_BTF_ID. Only
reg->off was taken into account by btf_struct_ids_match, which protected
against type mismatch due to non-zero reg->off, but when reg->off was
zero, a user could set the variable offset of the register and allow it
to be passed to kfunc, leading to bad pointer being passed into the
kernel.

Fix this by reusing the extracted helper check_func_arg_reg_off from
previous commit, and make one call before checking all supported
register types. Since the list is maintained, any future changes will be
taken into account by updating check_func_arg_reg_off. This function
prevents non-zero var_off to be set for PTR_TO_BTF_ID, but still allows
a fixed non-zero reg->off, which is needed for type matching to work
correctly when using pointer arithmetic.

ARG_DONTCARE is passed as arg_type, since kfunc doesn't support
accepting a ARG_PTR_TO_ALLOC_MEM without relying on size of parameter
type from BTF (in case of pointer), or using a mem, len pair. The
forcing of offset check for ARG_PTR_TO_ALLOC_MEM is done because ringbuf
helpers obtain the size from the header located at the beginning of the
memory region, hence any changes to the original pointer shouldn't be
allowed. In case of kfunc, size is always known, either at verification
time, or using the length parameter, hence this forcing is not required.

Since this check will happen once already for PTR_TO_CTX, remove the
check_ptr_off_reg call inside its block.

Fixes: e6ac2450d6de ("bpf: Support bpf program calling kernel function")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220304224645.3677453-3-memxor@gmail.com
---
 kernel/bpf/btf.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b472cf0c8fdb..7f6a0ae5028b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5726,7 +5726,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
-	int ref_regno = 0;
+	int ref_regno = 0, ret;
 	bool rel = false;
 
 	t = btf_type_by_id(btf, func_id);
@@ -5776,6 +5776,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+		ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+		if (ret < 0)
+			return ret;
+
 		if (btf_get_prog_ctx_type(log, btf, t,
 					  env->prog->type, i)) {
 			/* If function expects ctx type in BTF check that caller
@@ -5787,8 +5792,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 					i, btf_type_str(t));
 				return -EINVAL;
 			}
-			if (check_ptr_off_reg(env, reg, regno))
-				return -EINVAL;
 		} else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
 			   (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
 			const struct btf_type *reg_ref_t;
-- 
cgit v1.2.3


From e1fad0ff46b32819d30cb487f1d39ba24e515843 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 5 Mar 2022 04:16:40 +0530
Subject: bpf: Disallow negative offset in check_ptr_off_reg

check_ptr_off_reg only allows fixed offset to be set for PTR_TO_BTF_ID,
where reg->off < 0 doesn't make sense. This would shift the pointer
backwards, and fails later in btf_struct_ids_match or btf_struct_walk
due to out of bounds access (since offset is interpreted as unsigned).

Improve the verifier by rejecting this case by using a better error
message for BPF helpers and kfunc, by putting a check inside the
check_func_arg_reg_off function.

Also, update existing verifier selftests to work with new error string.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220304224645.3677453-4-memxor@gmail.com
---
 kernel/bpf/verifier.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e37eb6020253..455b4ab69e47 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3990,6 +3990,12 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
 	 * is only allowed in its original, unmodified form.
 	 */
 
+	if (reg->off < 0) {
+		verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+			reg_type_str(env, reg->type), regno, reg->off);
+		return -EACCES;
+	}
+
 	if (!fixed_off_ok && reg->off) {
 		verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
 			reg_type_str(env, reg->type), regno, reg->off);
-- 
cgit v1.2.3


From 24d5bb806c7e2c0b9972564fd493069f612d90dd Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 5 Mar 2022 04:16:41 +0530
Subject: bpf: Harden register offset checks for release helpers and kfuncs

Let's ensure that the PTR_TO_BTF_ID reg being passed in to release BPF
helpers and kfuncs always has its offset set to 0. While not a real
problem now, there's a very real possibility this will become a problem
when more and more kfuncs are exposed, and more BPF helpers are added
which can release PTR_TO_BTF_ID.

Previous commits already protected against non-zero var_off. One of the
case we are concerned about now is when we have a type that can be
returned by e.g. an acquire kfunc:

struct foo {
	int a;
	int b;
	struct bar b;
};

... and struct bar is also a type that can be returned by another
acquire kfunc.

Then, doing the following sequence:

	struct foo *f = bpf_get_foo(); // acquire kfunc
	if (!f)
		return 0;
	bpf_put_bar(&f->b); // release kfunc

... would work with the current code, since the btf_struct_ids_match
takes reg->off into account for matching pointer type with release kfunc
argument type, but would obviously be incorrect, and most likely lead to
a kernel crash. A test has been included later to prevent regressions in
this area.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220304224645.3677453-5-memxor@gmail.com
---
 kernel/bpf/btf.c      | 33 +++++++++++++++++++--------------
 kernel/bpf/verifier.c | 25 ++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7f6a0ae5028b..162807e3b4a5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5753,6 +5753,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	/* Only kfunc can be release func */
+	if (is_kfunc)
+		rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+						BTF_KFUNC_TYPE_RELEASE, func_id);
 	/* check that BTF function arguments match actual types that the
 	 * verifier sees.
 	 */
@@ -5777,7 +5781,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 
-		ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+		ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel);
 		if (ret < 0)
 			return ret;
 
@@ -5809,7 +5813,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			if (reg->type == PTR_TO_BTF_ID) {
 				reg_btf = reg->btf;
 				reg_ref_id = reg->btf_id;
-				/* Ensure only one argument is referenced PTR_TO_BTF_ID */
+				/* Ensure only one argument is referenced
+				 * PTR_TO_BTF_ID, check_func_arg_reg_off relies
+				 * on only one referenced register being allowed
+				 * for kfuncs.
+				 */
 				if (reg->ref_obj_id) {
 					if (ref_obj_id) {
 						bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -5891,18 +5899,15 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 
 	/* Either both are set, or neither */
 	WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
-	if (is_kfunc) {
-		rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
-						BTF_KFUNC_TYPE_RELEASE, func_id);
-		/* We already made sure ref_obj_id is set only for one argument */
-		if (rel && !ref_obj_id) {
-			bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
-				func_name);
-			return -EINVAL;
-		}
-		/* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to
-		 * other kfuncs works
-		 */
+	/* We already made sure ref_obj_id is set only for one argument. We do
+	 * allow (!rel && ref_obj_id), so that passing such referenced
+	 * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
+	 * is_kfunc is true.
+	 */
+	if (rel && !ref_obj_id) {
+		bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+			func_name);
+		return -EINVAL;
 	}
 	/* returns argument register number > 0 in case of reference release kfunc */
 	return rel ? ref_regno : 0;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 455b4ab69e47..fe9a513e2314 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5367,10 +5367,11 @@ found:
 
 int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type)
+			   enum bpf_arg_type arg_type,
+			   bool is_release_func)
 {
+	bool fixed_off_ok = false, release_reg;
 	enum bpf_reg_type type = reg->type;
-	bool fixed_off_ok = false;
 
 	switch ((u32)type) {
 	case SCALAR_VALUE:
@@ -5395,6 +5396,21 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	 * fixed offset.
 	 */
 	case PTR_TO_BTF_ID:
+		/* When referenced PTR_TO_BTF_ID is passed to release function,
+		 * it's fixed offset must be 0. We rely on the property that
+		 * only one referenced register can be passed to BPF helpers and
+		 * kfuncs. In the other cases, fixed offset can be non-zero.
+		 */
+		release_reg = is_release_func && reg->ref_obj_id;
+		if (release_reg && reg->off) {
+			verbose(env, "R%d must have zero offset when passed to release func\n",
+				regno);
+			return -EINVAL;
+		}
+		/* For release_reg == true, fixed_off_ok must be false, but we
+		 * already checked and rejected reg->off != 0 above, so set to
+		 * true to allow fixed offset for all other cases.
+		 */
 		fixed_off_ok = true;
 		break;
 	default:
@@ -5452,11 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	if (err)
 		return err;
 
-	err = check_func_arg_reg_off(env, reg, regno, arg_type);
+	err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id));
 	if (err)
 		return err;
 
 skip_type_check:
+	/* check_func_arg_reg_off relies on only one referenced register being
+	 * allowed for BPF helpers.
+	 */
 	if (reg->ref_obj_id) {
 		if (meta->ref_obj_id) {
 			verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
-- 
cgit v1.2.3


From bff61f6faedb36db6b135da898840d29aa74cbbb Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Fri, 4 Mar 2022 11:16:54 -0800
Subject: bpf: Fix checking PTR_TO_BTF_ID in check_mem_access

With the introduction of MEM_USER in

 commit c6f1bfe89ac9 ("bpf: reject program if a __user tagged memory accessed in kernel way")

PTR_TO_BTF_ID can be combined with a MEM_USER tag. Therefore, most
likely, when we compare reg_type against PTR_TO_BTF_ID, we want to use
the reg's base_type. Previously the check in check_mem_access() wants
to say: if the reg is BTF_ID but not NULL, the execution flow falls
into the 'then' branch. But now a reg of (BTF_ID | MEM_USER), which
should go into the 'then' branch, goes into the 'else'.

The end results before and after this patch are the same: regs tagged
with MEM_USER get rejected, but not in a way we intended. So fix the
condition, the error message now is correct.

Before (log from commit 696c39011538):

  $ ./test_progs -v -n 22/3
  ...
  libbpf: prog 'test_user1': BPF program load failed: Permission denied
  libbpf: prog 'test_user1': -- BEGIN PROG LOAD LOG --
  R1 type=ctx expected=fp
  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  ; int BPF_PROG(test_user1, struct bpf_testmod_btf_type_tag_1 *arg)
  0: (79) r1 = *(u64 *)(r1 +0)
  func 'bpf_testmod_test_btf_type_tag_user_1' arg0 has btf_id 136561 type STRUCT 'bpf_testmod_btf_type_tag_1'
  1: R1_w=user_ptr_bpf_testmod_btf_type_tag_1(id=0,off=0,imm=0)
  ; g = arg->a;
  1: (61) r1 = *(u32 *)(r1 +0)
  R1 invalid mem access 'user_ptr_'

Now:

  libbpf: prog 'test_user1': BPF program load failed: Permission denied
  libbpf: prog 'test_user1': -- BEGIN PROG LOAD LOG --
  R1 type=ctx expected=fp
  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  ; int BPF_PROG(test_user1, struct bpf_testmod_btf_type_tag_1 *arg)
  0: (79) r1 = *(u64 *)(r1 +0)
  func 'bpf_testmod_test_btf_type_tag_user_1' arg0 has btf_id 104036 type STRUCT 'bpf_testmod_btf_type_tag_1'
  1: R1_w=user_ptr_bpf_testmod_btf_type_tag_1(id=0,ref_obj_id=0,off=0,imm=0)
  ; g = arg->a;
  1: (61) r1 = *(u32 *)(r1 +0)
  R1 is ptr_bpf_testmod_btf_type_tag_1 access user memory: off=0

Note the error message for the reason of rejection.

Fixes: c6f1bfe89ac9 ("bpf: reject program if a __user tagged memory accessed in kernel way")
Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220304191657.981240-2-haoluo@google.com
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fe9a513e2314..7a6b58fea37d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4562,7 +4562,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_tp_buffer_access(env, reg, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
-	} else if (reg->type == PTR_TO_BTF_ID) {
+	} else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+		   !type_may_be_null(reg->type)) {
 		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
 					      value_regno);
 	} else if (reg->type == CONST_PTR_TO_MAP) {
-- 
cgit v1.2.3


From 5844101a1be9b8636024cb31c865ef13c7cc6db3 Mon Sep 17 00:00:00 2001
From: Hao Luo <haoluo@google.com>
Date: Fri, 4 Mar 2022 11:16:56 -0800
Subject: bpf: Reject programs that try to load __percpu memory.

With the introduction of the btf_type_tag "percpu", we can add a
MEM_PERCPU to identify those pointers that point to percpu memory.
The ability of differetiating percpu pointers from regular memory
pointers have two benefits:

 1. It forbids unexpected use of percpu pointers, such as direct loads.
    In kernel, there are special functions used for accessing percpu
    memory. Directly loading percpu memory is meaningless. We already
    have BPF helpers like bpf_per_cpu_ptr() and bpf_this_cpu_ptr() that
    wrap the kernel percpu functions. So we can now convert percpu
    pointers into regular pointers in a safe way.

 2. Previously, bpf_per_cpu_ptr() and bpf_this_cpu_ptr() only work on
    PTR_TO_PERCPU_BTF_ID, a special reg_type which describes static
    percpu variables in kernel (we rely on pahole to encode them into
    vmlinux BTF). Now, since we can identify __percpu tagged pointers,
    we can also identify dynamically allocated percpu memory as well.
    It means we can use bpf_xxx_cpu_ptr() on dynamic percpu memory.
    This would be very convenient when accessing fields like
    "cgroup->rstat_cpu".

Signed-off-by: Hao Luo <haoluo@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220304191657.981240-4-haoluo@google.com
---
 kernel/bpf/btf.c      |  8 +++++++-
 kernel/bpf/verifier.c | 24 ++++++++++++++----------
 2 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 162807e3b4a5..8b34563a832e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5057,6 +5057,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		tag_value = __btf_name_by_offset(btf, t->name_off);
 		if (strcmp(tag_value, "user") == 0)
 			info->reg_type |= MEM_USER;
+		if (strcmp(tag_value, "percpu") == 0)
+			info->reg_type |= MEM_PERCPU;
 	}
 
 	/* skip modifiers */
@@ -5285,12 +5287,16 @@ error:
 				return -EACCES;
 			}
 
-			/* check __user tag */
+			/* check type tag */
 			t = btf_type_by_id(btf, mtype->type);
 			if (btf_type_is_type_tag(t)) {
 				tag_value = __btf_name_by_offset(btf, t->name_off);
+				/* check __user tag */
 				if (strcmp(tag_value, "user") == 0)
 					tmp_flag = MEM_USER;
+				/* check __percpu tag */
+				if (strcmp(tag_value, "percpu") == 0)
+					tmp_flag = MEM_PERCPU;
 			}
 
 			stype = btf_type_skip_modifiers(btf, mtype->type, &id);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7a6b58fea37d..ec3a7b6c9515 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -554,7 +554,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 		[PTR_TO_TP_BUFFER]	= "tp_buffer",
 		[PTR_TO_XDP_SOCK]	= "xdp_sock",
 		[PTR_TO_BTF_ID]		= "ptr_",
-		[PTR_TO_PERCPU_BTF_ID]	= "percpu_ptr_",
 		[PTR_TO_MEM]		= "mem",
 		[PTR_TO_BUF]		= "buf",
 		[PTR_TO_FUNC]		= "func",
@@ -562,8 +561,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 	};
 
 	if (type & PTR_MAYBE_NULL) {
-		if (base_type(type) == PTR_TO_BTF_ID ||
-		    base_type(type) == PTR_TO_PERCPU_BTF_ID)
+		if (base_type(type) == PTR_TO_BTF_ID)
 			strncpy(postfix, "or_null_", 16);
 		else
 			strncpy(postfix, "_or_null", 16);
@@ -575,6 +573,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 		strncpy(prefix, "alloc_", 32);
 	if (type & MEM_USER)
 		strncpy(prefix, "user_", 32);
+	if (type & MEM_PERCPU)
+		strncpy(prefix, "percpu_", 32);
 
 	snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
 		 prefix, str[base_type(type)], postfix);
@@ -697,8 +697,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			const char *sep = "";
 
 			verbose(env, "%s", reg_type_str(env, t));
-			if (base_type(t) == PTR_TO_BTF_ID ||
-			    base_type(t) == PTR_TO_PERCPU_BTF_ID)
+			if (base_type(t) == PTR_TO_BTF_ID)
 				verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
 			verbose(env, "(");
 /*
@@ -2783,7 +2782,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_XDP_SOCK:
 	case PTR_TO_BTF_ID:
 	case PTR_TO_BUF:
-	case PTR_TO_PERCPU_BTF_ID:
 	case PTR_TO_MEM:
 	case PTR_TO_FUNC:
 	case PTR_TO_MAP_KEY:
@@ -4203,6 +4201,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	if (reg->type & MEM_PERCPU) {
+		verbose(env,
+			"R%d is ptr_%s access percpu memory: off=%d\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+
 	if (env->ops->btf_struct_access) {
 		ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
 						  off, size, atype, &btf_id, &flag);
@@ -4809,7 +4814,7 @@ static int check_stack_range_initialized(
 		}
 
 		if (is_spilled_reg(&state->stack[spi]) &&
-		    state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+		    base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
 			goto mark;
 
 		if (is_spilled_reg(&state->stack[spi]) &&
@@ -5265,7 +5270,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | ME
 static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
 static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
 static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
 static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
@@ -9677,7 +9682,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			dst_reg->mem_size = aux->btf_var.mem_size;
 			break;
 		case PTR_TO_BTF_ID:
-		case PTR_TO_PERCPU_BTF_ID:
 			dst_reg->btf = aux->btf_var.btf;
 			dst_reg->btf_id = aux->btf_var.btf_id;
 			break;
@@ -11877,7 +11881,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
 	type = t->type;
 	t = btf_type_skip_modifiers(btf, type, NULL);
 	if (percpu) {
-		aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+		aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
 		aux->btf_var.btf = btf;
 		aux->btf_var.btf_id = type;
 	} else if (!btf_type_is_struct(t)) {
-- 
cgit v1.2.3


From a1ff03cd6fb9c501fff63a4a2bface9adcfa81cd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 2 Feb 2022 01:01:07 +0100
Subject: tick: Detect and fix jiffies update stall

On some rare cases, the timekeeper CPU may be delaying its jiffies
update duty for a while. Known causes include:

* The timekeeper is waiting on stop_machine in a MULTI_STOP_DISABLE_IRQ
  or MULTI_STOP_RUN state. Disabled interrupts prevent from timekeeping
  updates while waiting for the target CPU to complete its
  stop_machine() callback.

* The timekeeper vcpu has VMEXIT'ed for a long while due to some overload
  on the host.

Detect and fix these situations with emergency timekeeping catchups.

Original-patch-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 17 +++++++++++++++++
 kernel/time/tick-sched.h |  4 ++++
 2 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 17a283ce2b20..c89f50a7e690 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -169,6 +169,8 @@ static ktime_t tick_init_jiffy_update(void)
 	return period;
 }
 
+#define MAX_STALLED_JIFFIES 5
+
 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 {
 	int cpu = smp_processor_id();
@@ -196,6 +198,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	if (tick_do_timer_cpu == cpu)
 		tick_do_update_jiffies64(now);
 
+	/*
+	 * If jiffies update stalled for too long (timekeeper in stop_machine()
+	 * or VMEXIT'ed for several msecs), force an update.
+	 */
+	if (ts->last_tick_jiffies != jiffies) {
+		ts->stalled_jiffies = 0;
+		ts->last_tick_jiffies = READ_ONCE(jiffies);
+	} else {
+		if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
+			tick_do_update_jiffies64(now);
+			ts->stalled_jiffies = 0;
+			ts->last_tick_jiffies = READ_ONCE(jiffies);
+		}
+	}
+
 	if (ts->inidle)
 		ts->got_idle_tick = 1;
 }
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index d952ae393423..504649513399 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -49,6 +49,8 @@ enum tick_nohz_mode {
  * @timer_expires_base:	Base time clock monotonic for @timer_expires
  * @next_timer:		Expiry time of next expiring timer for debugging purpose only
  * @tick_dep_mask:	Tick dependency mask - is set, if someone needs the tick
+ * @last_tick_jiffies:	Value of jiffies seen on last tick
+ * @stalled_jiffies:	Number of stalled jiffies detected across ticks
  */
 struct tick_sched {
 	struct hrtimer			sched_timer;
@@ -77,6 +79,8 @@ struct tick_sched {
 	u64				next_timer;
 	ktime_t				idle_expires;
 	atomic_t			tick_dep_mask;
+	unsigned long			last_tick_jiffies;
+	unsigned int			stalled_jiffies;
 };
 
 extern struct tick_sched *tick_get_tick_sched(int cpu);
-- 
cgit v1.2.3


From 2984539959dbaf4e65e19bf90c2419304a81a985 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 8 Feb 2022 17:16:33 +0100
Subject: tick/rcu: Remove obsolete rcu_needs_cpu() parameters

With the removal of CONFIG_RCU_FAST_NO_HZ, the parameters in
rcu_needs_cpu() are not necessary anymore. Simply remove them.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Paul Menzel <pmenzel@molgen.mpg.de>
---
 kernel/rcu/tree.c        |  3 +--
 kernel/time/tick-sched.c | 10 ++++------
 2 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4c25a6283b0..80faf2273ce9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1086,9 +1086,8 @@ void rcu_irq_enter_irqson(void)
  * Just check whether or not this CPU has non-offloaded RCU callbacks
  * queued.
  */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
+int rcu_needs_cpu(void)
 {
-	*nextevt = KTIME_MAX;
 	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
 		!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c89f50a7e690..566ad5bd83e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -785,7 +785,7 @@ static inline bool local_timer_softirq_pending(void)
 
 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 {
-	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+	u64 basemono, next_tick, delta, expires;
 	unsigned long basejiff;
 	unsigned int seq;
 
@@ -808,7 +808,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 	 * minimal delta which brings us back to this place
 	 * immediately. Lather, rinse and repeat...
 	 */
-	if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+	if (rcu_needs_cpu() || arch_needs_cpu() ||
 	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
 		next_tick = basemono + TICK_NSEC;
 	} else {
@@ -819,10 +819,8 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 		 * disabled this also looks at the next expiring
 		 * hrtimer.
 		 */
-		next_tmr = get_next_timer_interrupt(basejiff, basemono);
-		ts->next_timer = next_tmr;
-		/* Take the next rcu event into account */
-		next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
+		next_tick = get_next_timer_interrupt(basejiff, basemono);
+		ts->next_timer = next_tick;
 	}
 
 	/*
-- 
cgit v1.2.3


From 0345691b24c076655ce8f0f4bfd24cba3467ccbd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 8 Feb 2022 17:16:34 +0100
Subject: tick/rcu: Stop allowing RCU_SOFTIRQ in idle

RCU_SOFTIRQ used to be special in that it could be raised on purpose
within the idle path to prevent from stopping the tick. Some code still
prevents from unnecessary warnings related to this specific behaviour
while entering in dynticks-idle mode.

However the nohz layout has changed quite a bit in ten years, and the
removal of CONFIG_RCU_FAST_NO_HZ has been the final straw to this
safe-conduct. Now the RCU_SOFTIRQ vector is expected to be raised from
sane places.

A remaining corner case is admitted though when the vector is invoked
in fragile hotplug path.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Paul Menzel <pmenzel@molgen.mpg.de>
---
 kernel/time/tick-sched.c | 50 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 566ad5bd83e9..2d76c91b85de 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -999,6 +999,45 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 	__tick_nohz_full_update_tick(ts, ktime_get());
 }
 
+/*
+ * A pending softirq outside an IRQ (or softirq disabled section) context
+ * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
+ * reach here due to the need_resched() early check in can_stop_idle_tick().
+ *
+ * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
+ * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
+ * triggering the below since wakep_softirqd() is ignored.
+ *
+ */
+static bool report_idle_softirq(void)
+{
+	static int ratelimit;
+	unsigned int pending = local_softirq_pending();
+
+	if (likely(!pending))
+		return false;
+
+	/* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
+	if (!cpu_active(smp_processor_id())) {
+		pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
+		if (!pending)
+			return false;
+	}
+
+	if (ratelimit < 10)
+		return false;
+
+	/* On RT, softirqs handling may be waiting on some lock */
+	if (!local_bh_blocked())
+		return false;
+
+	pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+		pending);
+	ratelimit++;
+
+	return true;
+}
+
 static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 {
 	/*
@@ -1025,17 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	if (need_resched())
 		return false;
 
-	if (unlikely(local_softirq_pending())) {
-		static int ratelimit;
-
-		if (ratelimit < 10 && !local_bh_blocked() &&
-		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
-			pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
-				(unsigned int) local_softirq_pending());
-			ratelimit++;
-		}
+	if (unlikely(report_idle_softirq()))
 		return false;
-	}
 
 	if (tick_nohz_full_enabled()) {
 		/*
-- 
cgit v1.2.3


From 03b9c7fa3f15f51bcd07f3828c2a01311e7746c4 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Fri, 4 Mar 2022 15:04:08 +0800
Subject: bpf: Replace strncpy() with strscpy()

Using strncpy() on NUL-terminated strings is considered deprecated[1].
Moreover, if the length of 'task->comm' is less than the destination buffer
size, strncpy() will NUL-pad the destination buffer, which is a needless
performance penalty.

Replacing strncpy() with strscpy() fixes all these issues.

[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220304070408.233658-1-ytcoode@gmail.com
---
 kernel/bpf/helpers.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ae64110a98b5..315053ef6a75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -225,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
 	if (unlikely(!task))
 		goto err_clear;
 
-	strncpy(buf, task->comm, size);
-
-	/* Verifier guarantees that size > 0. For task->comm exceeding
-	 * size, guarantee that buf is %NUL-terminated. Unconditionally
-	 * done here to save the size test.
-	 */
-	buf[size - 1] = 0;
+	/* Verifier guarantees that size > 0 */
+	strscpy(buf, task->comm, size);
 	return 0;
 err_clear:
 	memset(buf, 0, size);
-- 
cgit v1.2.3


From 4989135a85334337ac8b7e42e7ee1a569ad5f7f5 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sun, 6 Mar 2022 00:10:13 +0800
Subject: bpf: Remove redundant slash

The trailing slash of LIBBPF_SRCS is redundant, remove it. Also inline
it as its only used in LIBBPF_INCLUDE.

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220305161013.361646-1-ytcoode@gmail.com
---
 kernel/bpf/preload/Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 167534e3b0b4..20f89cc0a0a6 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
-LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_INCLUDE = $(LIBBPF_SRCS)/..
+LIBBPF_INCLUDE = $(srctree)/tools/lib
 
 obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-CFLAGS_bpf_preload_kern.o += -I $(LIBBPF_INCLUDE)
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
 bpf_preload-objs += bpf_preload_kern.o
-- 
cgit v1.2.3


From 44e9a741cad824f45112b79c21d88c201d5aec13 Mon Sep 17 00:00:00 2001
From: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Date: Mon, 7 Mar 2022 13:29:18 +0800
Subject: bpf: Determine buf_info inside check_buffer_access()

Instead of determining buf_info string in the caller of check_buffer_access(),
we can determine whether the register type is read-only through
type_is_rdonly_mem() helper inside check_buffer_access() and construct
buf_info, making the code slightly cleaner.

Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/YiWYLnAkEZXBP/gH@syu-laptop
---
 kernel/bpf/verifier.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ec3a7b6c9515..e34264200e09 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4062,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env,
 			       const struct bpf_reg_state *reg,
 			       int regno, int off, int size,
 			       bool zero_size_allowed,
-			       const char *buf_info,
 			       u32 *max_access)
 {
+	const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
 	int err;
 
 	err = __check_buffer_access(env, buf_info, reg, regno, off, size);
@@ -4576,7 +4576,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 					      value_regno);
 	} else if (base_type(reg->type) == PTR_TO_BUF) {
 		bool rdonly_mem = type_is_rdonly_mem(reg->type);
-		const char *buf_info;
 		u32 *max_access;
 
 		if (rdonly_mem) {
@@ -4585,15 +4584,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 					regno, reg_type_str(env, reg->type));
 				return -EACCES;
 			}
-			buf_info = "rdonly";
 			max_access = &env->prog->aux->max_rdonly_access;
 		} else {
-			buf_info = "rdwr";
 			max_access = &env->prog->aux->max_rdwr_access;
 		}
 
 		err = check_buffer_access(env, reg, regno, off, size, false,
-					  buf_info, max_access);
+					  max_access);
 
 		if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
 			mark_reg_unknown(env, regs, value_regno);
@@ -4856,7 +4853,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 				   struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
-	const char *buf_info;
 	u32 *max_access;
 
 	switch (base_type(reg->type)) {
@@ -4883,15 +4879,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 			if (meta && meta->raw_mode)
 				return -EACCES;
 
-			buf_info = "rdonly";
 			max_access = &env->prog->aux->max_rdonly_access;
 		} else {
-			buf_info = "rdwr";
 			max_access = &env->prog->aux->max_rdwr_access;
 		}
 		return check_buffer_access(env, reg, regno, reg->off,
 					   access_size, zero_size_allowed,
-					   buf_info, max_access);
+					   max_access);
 	case PTR_TO_STACK:
 		return check_stack_range_initialized(
 				env,
-- 
cgit v1.2.3


From eb77cf1c151c4a1c2147cbf24d84bcf0ba504e7c Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Mar 2022 19:34:28 +0100
Subject: sched/deadline: Remove unused def_dl_bandwidth

Since commit 1724813d9f2c ("sched/deadline: Remove the sysctl_sched_dl
knobs") the default deadline bandwidth control structure has no purpose.
Remove it.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220302183433.333029-2-dietmar.eggemann@arm.com
---
 kernel/sched/core.c     | 1 -
 kernel/sched/deadline.c | 7 -------
 kernel/sched/sched.h    | 1 -
 3 files changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3aafc15da24a..d342c4c779f7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9420,7 +9420,6 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
 	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
-	init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
 
 #ifdef CONFIG_SMP
 	init_defrootdomain();
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 62f0cf842277..ed4251fa87c7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -18,8 +18,6 @@
 #include "sched.h"
 #include "pelt.h"
 
-struct dl_bandwidth def_dl_bandwidth;
-
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
 	return container_of(dl_se, struct task_struct, dl);
@@ -423,12 +421,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
 void init_dl_bw(struct dl_bw *dl_b)
 {
 	raw_spin_lock_init(&dl_b->lock);
-	raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
 	if (global_rt_runtime() == RUNTIME_INF)
 		dl_b->bw = -1;
 	else
 		dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
-	raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
 	dl_b->total_bw = 0;
 }
 
@@ -2731,9 +2727,6 @@ void sched_dl_do_global(void)
 	int cpu;
 	unsigned long flags;
 
-	def_dl_bandwidth.dl_period = global_rt_period();
-	def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
 	if (global_rt_runtime() != RUNTIME_INF)
 		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3da5718cd641..a8b8516b8452 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2333,7 +2333,6 @@ extern void resched_cpu(int cpu);
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 
-extern struct dl_bandwidth def_dl_bandwidth;
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
-- 
cgit v1.2.3


From f1304ecbef3c9f4aec119ce2a07335d3a0bc55a6 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Mar 2022 19:34:29 +0100
Subject: sched/deadline: Move bandwidth mgmt and reclaim functions into sched
 class source file

Move the deadline bandwidth management (admission control) functions
__dl_add(), __dl_sub() and __dl_overflow() as well as the bandwidth
reclaim function __dl_update() from private task scheduler header file
to the deadline sched class source file.
The functions are only used internally so they don't have to be
exported.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220302183433.333029-3-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h    | 49 -------------------------------------------------
 2 files changed, 44 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ed4251fa87c7..81bf97648e42 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -128,6 +128,21 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
 	rd->visit_gen = gen;
 	return false;
 }
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+	struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+	int i;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+	for_each_cpu_and(i, rd->span, cpu_active_mask) {
+		struct rq *rq = cpu_rq(i);
+
+		rq->dl.extra_bw += bw;
+	}
+}
 #else
 static inline struct dl_bw *dl_bw_of(int i)
 {
@@ -148,8 +163,37 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
 {
 	return false;
 }
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+	dl->extra_bw += bw;
+}
 #endif
 
+static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+	dl_b->total_bw -= tsk_bw;
+	__dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+	dl_b->total_bw += tsk_bw;
+	__dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+	return dl_b->bw != -1 &&
+	       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
 static inline
 void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a8b8516b8452..4dfc3b02df61 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -301,29 +301,6 @@ struct dl_bw {
 	u64			total_bw;
 };
 
-static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
-
-static inline
-void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
-	dl_b->total_bw -= tsk_bw;
-	__dl_update(dl_b, (s32)tsk_bw / cpus);
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
-	dl_b->total_bw += tsk_bw;
-	__dl_update(dl_b, -((s32)tsk_bw / cpus));
-}
-
-static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
-				 u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
-}
-
 /*
  * Verify the fitness of task @p to run on @cpu taking into account the
  * CPU original capacity and the runtime/deadline ratio of the task.
@@ -2748,32 +2725,6 @@ extern void nohz_run_idle_balance(int cpu);
 static inline void nohz_run_idle_balance(int cpu) { }
 #endif
 
-#ifdef CONFIG_SMP
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
-	struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
-	int i;
-
-	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
-			 "sched RCU must be held");
-	for_each_cpu_and(i, rd->span, cpu_active_mask) {
-		struct rq *rq = cpu_rq(i);
-
-		rq->dl.extra_bw += bw;
-	}
-}
-#else
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
-	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
-	dl->extra_bw += bw;
-}
-#endif
-
-
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
 	u64			total;
-- 
cgit v1.2.3


From 772b6539fdda31462cc08368e78df60b31a58bab Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Mar 2022 19:34:30 +0100
Subject: sched/deadline: Merge dl_task_can_attach() and dl_cpu_busy()

Both functions are doing almost the same, that is checking if admission
control is still respected.

With exclusive cpusets, dl_task_can_attach() checks if the destination
cpuset (i.e. its root domain) has enough CPU capacity to accommodate the
task.
dl_cpu_busy() checks if there is enough CPU capacity in the cpuset in
case the CPU is hot-plugged out.

dl_task_can_attach() is used to check if a task can be admitted while
dl_cpu_busy() is used to check if a CPU can be hotplugged out.

Make dl_cpu_busy() able to deal with a task and use it instead of
dl_task_can_attach() in task_can_attach().

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220302183433.333029-4-dietmar.eggemann@arm.com
---
 kernel/sched/core.c     | 13 +++++++++----
 kernel/sched/deadline.c | 52 +++++++++++++------------------------------------
 kernel/sched/sched.h    |  3 +--
 3 files changed, 24 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d342c4c779f7..68736d1dc0f4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8805,8 +8805,11 @@ int task_can_attach(struct task_struct *p,
 	}
 
 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-					      cs_cpus_allowed))
-		ret = dl_task_can_attach(p, cs_cpus_allowed);
+					      cs_cpus_allowed)) {
+		int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
+		ret = dl_cpu_busy(cpu, p);
+	}
 
 out:
 	return ret;
@@ -9090,8 +9093,10 @@ static void cpuset_cpu_active(void)
 static int cpuset_cpu_inactive(unsigned int cpu)
 {
 	if (!cpuhp_tasks_frozen) {
-		if (dl_cpu_busy(cpu))
-			return -EBUSY;
+		int ret = dl_cpu_busy(cpu, NULL);
+
+		if (ret)
+			return ret;
 		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 81bf97648e42..de677b1e3767 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2992,41 +2992,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 }
 
 #ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
-	unsigned long flags, cap;
-	unsigned int dest_cpu;
-	struct dl_bw *dl_b;
-	bool overflow;
-	int ret;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
-	rcu_read_lock_sched();
-	dl_b = dl_bw_of(dest_cpu);
-	raw_spin_lock_irqsave(&dl_b->lock, flags);
-	cap = dl_bw_capacity(dest_cpu);
-	overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
-	if (overflow) {
-		ret = -EBUSY;
-	} else {
-		/*
-		 * We reserve space for this task in the destination
-		 * root_domain, as we can't fail after this point.
-		 * We will free resources in the source root_domain
-		 * later on (see set_cpus_allowed_dl()).
-		 */
-		int cpus = dl_bw_cpus(dest_cpu);
-
-		__dl_add(dl_b, p->dl.dl_bw, cpus);
-		ret = 0;
-	}
-	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-	rcu_read_unlock_sched();
-
-	return ret;
-}
-
 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 				 const struct cpumask *trial)
 {
@@ -3048,7 +3013,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 	return ret;
 }
 
-bool dl_cpu_busy(unsigned int cpu)
+int dl_cpu_busy(int cpu, struct task_struct *p)
 {
 	unsigned long flags, cap;
 	struct dl_bw *dl_b;
@@ -3058,11 +3023,22 @@ bool dl_cpu_busy(unsigned int cpu)
 	dl_b = dl_bw_of(cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
 	cap = dl_bw_capacity(cpu);
-	overflow = __dl_overflow(dl_b, cap, 0, 0);
+	overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
+
+	if (!overflow && p) {
+		/*
+		 * We reserve space for this task in the destination
+		 * root_domain, as we can't fail after this point.
+		 * We will free resources in the source root_domain
+		 * later on (see set_cpus_allowed_dl()).
+		 */
+		__dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+	}
+
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
 
-	return overflow;
+	return overflow ? -EBUSY : 0;
 }
 #endif
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4dfc3b02df61..0720cf0c7df1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -324,9 +324,8 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int  dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern bool dl_cpu_busy(unsigned int cpu);
+extern int  dl_cpu_busy(int cpu, struct task_struct *p);
 
 #ifdef CONFIG_CGROUP_SCHED
 
-- 
cgit v1.2.3


From f4478e7c855d2d6b2fde5126ebcca2cb5b34ee36 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Mar 2022 19:34:31 +0100
Subject: sched/deadline: Use __node_2_[pdl|dle]() and rb_first_cached()
 consistently

Deploy __node_2_pdl(node), __node_2_dle(node) and rb_first_cached()
consistently throughout the sched class source file which makes the
code at least easier to read.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220302183433.333029-5-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index de677b1e3767..3242dd4972e1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -450,7 +450,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 
-	return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+	return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
 }
 
 static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
@@ -1433,6 +1433,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
 	timer->function = inactive_task_timer;
 }
 
+#define __node_2_dle(node) \
+	rb_entry((node), struct sched_dl_entity, rb_node)
+
 #ifdef CONFIG_SMP
 
 static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -1462,10 +1465,9 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 		cpudl_clear(&rq->rd->cpudl, rq->cpu);
 		cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
 	} else {
-		struct rb_node *leftmost = dl_rq->root.rb_leftmost;
-		struct sched_dl_entity *entry;
+		struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+		struct sched_dl_entity *entry = __node_2_dle(leftmost);
 
-		entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
 		dl_rq->earliest_dl.curr = entry->deadline;
 		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
 	}
@@ -1506,9 +1508,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	dec_dl_migration(dl_se, dl_rq);
 }
 
-#define __node_2_dle(node) \
-	rb_entry((node), struct sched_dl_entity, rb_node)
-
 static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
 {
 	return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
@@ -1979,7 +1978,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	if (!left)
 		return NULL;
 
-	return rb_entry(left, struct sched_dl_entity, rb_node);
+	return __node_2_dle(left);
 }
 
 static struct task_struct *pick_task_dl(struct rq *rq)
@@ -2074,15 +2073,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
  */
 static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
 {
-	struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
 	struct task_struct *p = NULL;
+	struct rb_node *next_node;
 
 	if (!has_pushable_dl_tasks(rq))
 		return NULL;
 
+	next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+
 next_node:
 	if (next_node) {
-		p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+		p = __node_2_pdl(next_node);
 
 		if (pick_dl_task(rq, p, cpu))
 			return p;
@@ -2248,8 +2249,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 	if (!has_pushable_dl_tasks(rq))
 		return NULL;
 
-	p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
-		     struct task_struct, pushable_dl_tasks);
+	p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
 
 	BUG_ON(rq->cpu != task_cpu(p));
 	BUG_ON(task_current(rq, p));
-- 
cgit v1.2.3


From 71d29747b0e26f36a50e6a65dc0191ca742b9222 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Mar 2022 19:34:32 +0100
Subject: sched/deadline,rt: Remove unused functions for !CONFIG_SMP

The need_pull_[rt|dl]_task() and pull_[rt|dl]_task() functions are not
used on a !CONFIG_SMP system. Remove them.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220302183433.333029-6-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c |  9 ---------
 kernel/sched/rt.c       | 11 -----------
 2 files changed, 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3242dd4972e1..93fcef57dd59 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -723,15 +723,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 }
 
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
-	return false;
-}
-
-static inline void pull_dl_task(struct rq *rq)
-{
-}
-
 static inline void deadline_queue_push_tasks(struct rq *rq)
 {
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 14f273c29518..b62e7652464b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -271,8 +271,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #ifdef CONFIG_SMP
 
-static void pull_rt_task(struct rq *this_rq);
-
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
@@ -429,15 +427,6 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 }
 
-static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
-{
-	return false;
-}
-
-static inline void pull_rt_task(struct rq *this_rq)
-{
-}
-
 static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
-- 
cgit v1.2.3


From 821aecd09e5ad2f8d4c3d8195333d272b392f7d3 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Wed, 2 Mar 2022 19:34:33 +0100
Subject: sched/deadline,rt: Remove unused parameter from
 pick_next_[rt|dl]_entity()

The `struct rq *rq` parameter isn't used. Remove it.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220302183433.333029-7-dietmar.eggemann@arm.com
---
 kernel/sched/deadline.c | 5 ++---
 kernel/sched/rt.c       | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 93fcef57dd59..11cdc6d0c45f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1961,8 +1961,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
 	deadline_queue_push_tasks(rq);
 }
 
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
-						   struct dl_rq *dl_rq)
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
 {
 	struct rb_node *left = rb_first_cached(&dl_rq->root);
 
@@ -1981,7 +1980,7 @@ static struct task_struct *pick_task_dl(struct rq *rq)
 	if (!sched_dl_runnable(rq))
 		return NULL;
 
-	dl_se = pick_next_dl_entity(rq, dl_rq);
+	dl_se = pick_next_dl_entity(dl_rq);
 	BUG_ON(!dl_se);
 	p = dl_task_of(dl_se);
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b62e7652464b..67039e5d359b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1719,8 +1719,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
 	rt_queue_push_tasks(rq);
 }
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
-						   struct rt_rq *rt_rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array = &rt_rq->active;
 	struct sched_rt_entity *next = NULL;
@@ -1742,7 +1741,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	struct rt_rq *rt_rq  = &rq->rt;
 
 	do {
-		rt_se = pick_next_rt_entity(rq, rt_rq);
+		rt_se = pick_next_rt_entity(rt_rq);
 		BUG_ON(!rt_se);
 		rt_rq = group_rt_rq(rt_se);
 	} while (rt_rq);
-- 
cgit v1.2.3


From 7f434dff76215af00c26ba6449eaa4738fe9e2ab Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Fri, 18 Feb 2022 21:57:43 +0530
Subject: sched/topology: Remove redundant variable and fix incorrect type in
 build_sched_domains

While investigating the sparse warning reported by the LKP bot [1],
observed that we have a redundant variable "top" in the function
build_sched_domains that was introduced in the recent commit
e496132ebedd ("sched/fair: Adjust the allowed NUMA imbalance when
SD_NUMA spans multiple LLCs")

The existing variable "sd" suffices which allows us to remove the
redundant variable "top" while annotating the other variable "top_p"
with the "__rcu" annotation to silence the sparse warning.

[1] https://lore.kernel.org/lkml/202202170853.9vofgC3O-lkp@intel.com/

Fixes: e496132ebedd ("sched/fair: Adjust the allowed NUMA imbalance when SD_NUMA spans multiple LLCs")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lore.kernel.org/r/20220218162743.1134-1-kprateek.nayak@amd.com
---
 kernel/sched/topology.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32841c6741d1..43f2899fe0ef 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2291,7 +2291,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
 			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
-				struct sched_domain *top, *top_p;
+				struct sched_domain __rcu *top_p;
 				unsigned int nr_llcs;
 
 				/*
@@ -2316,11 +2316,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 				sd->imb_numa_nr = imb;
 
 				/* Set span based on the first NUMA domain. */
-				top = sd;
-				top_p = top->parent;
+				top_p = sd->parent;
 				while (top_p && !(top_p->flags & SD_NUMA)) {
-					top = top->parent;
-					top_p = top->parent;
+					top_p = top_p->parent;
 				}
 				imb_span = top_p ? top_p->span_weight : sd->span_weight;
 			} else {
-- 
cgit v1.2.3


From c57bef0287dd5deeabaea5727950559fb9037cd9 Mon Sep 17 00:00:00 2001
From: Barret Rhoden <brho@google.com>
Date: Thu, 6 Jan 2022 12:20:40 -0500
Subject: prlimit: make do_prlimit() static

There are no other callers in the kernel.

Fixed up a comment format and whitespace issue when moving do_prlimit()
higher in sys.c.

Signed-off-by: Barret Rhoden <brho@google.com>
Link: https://lkml.kernel.org/r/20220106172041.522167-3-brho@google.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sys.c | 116 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 59 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 5b0e172c4d47..d37d1f882b7d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1424,6 +1424,65 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 	return errno;
 }
 
+/* make sure you are allowed to change @tsk limits before calling this */
+static int do_prlimit(struct task_struct *tsk, unsigned int resource,
+		      struct rlimit *new_rlim, struct rlimit *old_rlim)
+{
+	struct rlimit *rlim;
+	int retval = 0;
+
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+	if (new_rlim) {
+		if (new_rlim->rlim_cur > new_rlim->rlim_max)
+			return -EINVAL;
+		if (resource == RLIMIT_NOFILE &&
+				new_rlim->rlim_max > sysctl_nr_open)
+			return -EPERM;
+	}
+
+	/* protect tsk->signal and tsk->sighand from disappearing */
+	read_lock(&tasklist_lock);
+	if (!tsk->sighand) {
+		retval = -ESRCH;
+		goto out;
+	}
+
+	rlim = tsk->signal->rlim + resource;
+	task_lock(tsk->group_leader);
+	if (new_rlim) {
+		/*
+		 * Keep the capable check against init_user_ns until cgroups can
+		 * contain all limits.
+		 */
+		if (new_rlim->rlim_max > rlim->rlim_max &&
+				!capable(CAP_SYS_RESOURCE))
+			retval = -EPERM;
+		if (!retval)
+			retval = security_task_setrlimit(tsk, resource, new_rlim);
+	}
+	if (!retval) {
+		if (old_rlim)
+			*old_rlim = *rlim;
+		if (new_rlim)
+			*rlim = *new_rlim;
+	}
+	task_unlock(tsk->group_leader);
+
+	/*
+	 * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
+	 * infinite. In case of RLIM_INFINITY the posix CPU timer code
+	 * ignores the rlimit.
+	 */
+	if (!retval && new_rlim && resource == RLIMIT_CPU &&
+	    new_rlim->rlim_cur != RLIM_INFINITY &&
+	    IS_ENABLED(CONFIG_POSIX_TIMERS))
+		update_rlimit_cpu(tsk, new_rlim->rlim_cur);
+out:
+	read_unlock(&tasklist_lock);
+	return retval;
+}
+
 SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 {
 	struct rlimit value;
@@ -1567,63 +1626,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
 		rlim->rlim_max = (unsigned long)rlim64->rlim_max;
 }
 
-/* make sure you are allowed to change @tsk limits before calling this */
-int do_prlimit(struct task_struct *tsk, unsigned int resource,
-		struct rlimit *new_rlim, struct rlimit *old_rlim)
-{
-	struct rlimit *rlim;
-	int retval = 0;
-
-	if (resource >= RLIM_NLIMITS)
-		return -EINVAL;
-	if (new_rlim) {
-		if (new_rlim->rlim_cur > new_rlim->rlim_max)
-			return -EINVAL;
-		if (resource == RLIMIT_NOFILE &&
-				new_rlim->rlim_max > sysctl_nr_open)
-			return -EPERM;
-	}
-
-	/* protect tsk->signal and tsk->sighand from disappearing */
-	read_lock(&tasklist_lock);
-	if (!tsk->sighand) {
-		retval = -ESRCH;
-		goto out;
-	}
-
-	rlim = tsk->signal->rlim + resource;
-	task_lock(tsk->group_leader);
-	if (new_rlim) {
-		/* Keep the capable check against init_user_ns until
-		   cgroups can contain all limits */
-		if (new_rlim->rlim_max > rlim->rlim_max &&
-				!capable(CAP_SYS_RESOURCE))
-			retval = -EPERM;
-		if (!retval)
-			retval = security_task_setrlimit(tsk, resource, new_rlim);
-	}
-	if (!retval) {
-		if (old_rlim)
-			*old_rlim = *rlim;
-		if (new_rlim)
-			*rlim = *new_rlim;
-	}
-	task_unlock(tsk->group_leader);
-
-	/*
-	 * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
-	 * infinite. In case of RLIM_INFINITY the posix CPU timer code
-	 * ignores the rlimit.
-	 */
-	 if (!retval && new_rlim && resource == RLIMIT_CPU &&
-	     new_rlim->rlim_cur != RLIM_INFINITY &&
-	     IS_ENABLED(CONFIG_POSIX_TIMERS))
-		update_rlimit_cpu(tsk, new_rlim->rlim_cur);
-out:
-	read_unlock(&tasklist_lock);
-	return retval;
-}
-
 /* rcu lock must be held */
 static int check_prlimit_permission(struct task_struct *task,
 				    unsigned int flags)
-- 
cgit v1.2.3


From 18c91bb2d87268d23868bf13508f5bc9cf04e89a Mon Sep 17 00:00:00 2001
From: Barret Rhoden <brho@google.com>
Date: Thu, 6 Jan 2022 12:20:41 -0500
Subject: prlimit: do not grab the tasklist_lock

Unnecessarily grabbing the tasklist_lock can be a scalability bottleneck
for workloads that also must grab the tasklist_lock for waiting,
killing, and cloning.

The tasklist_lock was grabbed to protect tsk->sighand from disappearing
(becoming NULL).  tsk->signal was already protected by holding a
reference to tsk.

update_rlimit_cpu() assumed tsk->sighand != NULL.  With this commit, it
attempts to lock_task_sighand().  However, this means that
update_rlimit_cpu() can fail.  This only happens when a task is exiting.
Note that during exec, sighand may *change*, but it will not be NULL.

Prior to this commit, the do_prlimit() ensured that update_rlimit_cpu()
would not fail by read locking the tasklist_lock and checking tsk->sighand
!= NULL.

If update_rlimit_cpu() fails, there may be other tasks that are not
exiting that share tsk->signal.  However, the group_leader is the last
task to be released, so if we cannot update_rlimit_cpu(group_leader),
then the entire process is exiting.

The only other caller of update_rlimit_cpu() is
selinux_bprm_committing_creds().  It has tsk == current, so
update_rlimit_cpu() cannot fail (current->sighand cannot disappear
until current exits).

This change resulted in a 14% speedup on a microbenchmark where parents
kill and wait on their children, and children getpriority, setpriority,
and getrlimit.

Signed-off-by: Barret Rhoden <brho@google.com>
Link: https://lkml.kernel.org/r/20220106172041.522167-4-brho@google.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/sys.c                   | 25 ++++++++++++++-----------
 kernel/time/posix-cpu-timers.c | 12 +++++++++---
 2 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index d37d1f882b7d..374f83e95239 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1441,13 +1441,7 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
 			return -EPERM;
 	}
 
-	/* protect tsk->signal and tsk->sighand from disappearing */
-	read_lock(&tasklist_lock);
-	if (!tsk->sighand) {
-		retval = -ESRCH;
-		goto out;
-	}
-
+	/* Holding a refcount on tsk protects tsk->signal from disappearing. */
 	rlim = tsk->signal->rlim + resource;
 	task_lock(tsk->group_leader);
 	if (new_rlim) {
@@ -1476,10 +1470,19 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,
 	 */
 	if (!retval && new_rlim && resource == RLIMIT_CPU &&
 	    new_rlim->rlim_cur != RLIM_INFINITY &&
-	    IS_ENABLED(CONFIG_POSIX_TIMERS))
-		update_rlimit_cpu(tsk, new_rlim->rlim_cur);
-out:
-	read_unlock(&tasklist_lock);
+	    IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+		/*
+		 * update_rlimit_cpu can fail if the task is exiting, but there
+		 * may be other tasks in the thread group that are not exiting,
+		 * and they need their cpu timers adjusted.
+		 *
+		 * The group_leader is the last task to be released, so if we
+		 * cannot update_rlimit_cpu on it, then the entire process is
+		 * exiting and we do not need to update at all.
+		 */
+		update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
+	}
+
 	return retval;
 }
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96b4e7810426..e13e628509fb 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -34,14 +34,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
  * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
  * necessary. Needs siglock protection since other code may update the
  * expiration cache as well.
+ *
+ * Returns 0 on success, -ESRCH on failure.  Can fail if the task is exiting and
+ * we cannot lock_task_sighand.  Cannot fail if task is current.
  */
-void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
+int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
 {
 	u64 nsecs = rlim_new * NSEC_PER_SEC;
+	unsigned long irq_fl;
 
-	spin_lock_irq(&task->sighand->siglock);
+	if (!lock_task_sighand(task, &irq_fl))
+		return -ESRCH;
 	set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
-	spin_unlock_irq(&task->sighand->siglock);
+	unlock_task_sighand(task, &irq_fl);
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From eba0549bc7d100691c13384b774346b8aa9cf9a9 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Fri, 25 Feb 2022 16:18:30 -0800
Subject: xfs: don't generate selinux audit messages for capability testing

There are a few places where we test the current process' capability set
to decide if we're going to be more or less generous with resource
acquisition for a system call.  If the process doesn't have the
capability, we can continue the call, albeit in a degraded mode.

These are /not/ the actual security decisions, so it's not proper to use
capable(), which (in certain selinux setups) causes audit messages to
get logged.  Switch them to has_capability_noaudit.

Fixes: 7317a03df703f ("xfs: refactor inode ownership change transaction/inode/quota allocation idiom")
Fixes: ea9a46e1c4925 ("xfs: only return detailed fsmap info if the caller has CAP_SYS_ADMIN")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
---
 kernel/capability.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 46a361dde042..765194f5d678 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
 {
 	return has_ns_capability_noaudit(t, &init_user_ns, cap);
 }
+EXPORT_SYMBOL(has_capability_noaudit);
 
 static bool ns_capable_common(struct user_namespace *ns,
 			      int cap,
-- 
cgit v1.2.3


From b530e9e1063ed2b817eae7eec6ed2daa8be11608 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Wed, 9 Mar 2022 11:53:42 +0100
Subject: bpf: Add "live packet" mode for XDP in BPF_PROG_RUN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for running XDP programs through BPF_PROG_RUN in a mode
that enables live packet processing of the resulting frames. Previous uses
of BPF_PROG_RUN for XDP returned the XDP program return code and the
modified packet data to userspace, which is useful for unit testing of XDP
programs.

The existing BPF_PROG_RUN for XDP allows userspace to set the ingress
ifindex and RXQ number as part of the context object being passed to the
kernel. This patch reuses that code, but adds a new mode with different
semantics, which can be selected with the new BPF_F_TEST_XDP_LIVE_FRAMES
flag.

When running BPF_PROG_RUN in this mode, the XDP program return codes will
be honoured: returning XDP_PASS will result in the frame being injected
into the networking stack as if it came from the selected networking
interface, while returning XDP_TX and XDP_REDIRECT will result in the frame
being transmitted out that interface. XDP_TX is translated into an
XDP_REDIRECT operation to the same interface, since the real XDP_TX action
is only possible from within the network drivers themselves, not from the
process context where BPF_PROG_RUN is executed.

Internally, this new mode of operation creates a page pool instance while
setting up the test run, and feeds pages from that into the XDP program.
The setup cost of this is amortised over the number of repetitions
specified by userspace.

To support the performance testing use case, we further optimise the setup
step so that all pages in the pool are pre-initialised with the packet
data, and pre-computed context and xdp_frame objects stored at the start of
each page. This makes it possible to entirely avoid touching the page
content on each XDP program invocation, and enables sending up to 9
Mpps/core on my test box.

Because the data pages are recycled by the page pool, and the test runner
doesn't re-initialise them for each run, subsequent invocations of the XDP
program will see the packet data in the state it was after the last time it
ran on that particular page. This means that an XDP program that modifies
the packet before redirecting it has to be careful about which assumptions
it makes about the packet content, but that is only an issue for the most
naively written programs.

Enabling the new flag is only allowed when not setting ctx_out and data_out
in the test specification, since using it means frames will be redirected
somewhere else, so they can't be returned.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220309105346.100053-2-toke@redhat.com
---
 kernel/bpf/Kconfig   | 1 +
 kernel/bpf/syscall.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index c3cf0b86eeb2..d56ee177d5f8 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -30,6 +30,7 @@ config BPF_SYSCALL
 	select TASKS_TRACE_RCU
 	select BINARY_PRINTF
 	select NET_SOCK_MSG if NET
+	select PAGE_POOL if NET
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate BPF programs
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index db402ebc5570..9beb585be5a6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3336,7 +3336,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	}
 }
 
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
 
 static int bpf_prog_test_run(const union bpf_attr *attr,
 			     union bpf_attr __user *uattr)
-- 
cgit v1.2.3


From 8ddde07a3d285a0f3cec14924446608320fdc013 Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Tue, 8 Mar 2022 16:59:10 +0800
Subject: dma-mapping: benchmark: extract a common header file for
 map_benchmark definition

kernel/dma/map_benchmark.c and selftests/dma/dma_map_benchmark.c
have duplicate map_benchmark definitions, which tends to lead to
inconsistent changes to map_benchmark on both sides, extract a
common header file to avoid this problem.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Acked-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/map_benchmark.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..0520a8f4fb1d 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,6 +11,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
+#include <linux/map_benchmark.h>
 #include <linux/math64.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -18,30 +19,6 @@
 #include <linux/slab.h>
 #include <linux/timekeeping.h>
 
-#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS	1024
-#define DMA_MAP_MAX_SECONDS	300
-#define DMA_MAP_MAX_TRANS_DELAY	(10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL	0
-#define DMA_MAP_TO_DEVICE	1
-#define DMA_MAP_FROM_DEVICE	2
-
-struct map_benchmark {
-	__u64 avg_map_100ns; /* average map latency in 100ns */
-	__u64 map_stddev; /* standard deviation of map latency */
-	__u64 avg_unmap_100ns; /* as above */
-	__u64 unmap_stddev;
-	__u32 threads; /* how many threads will do map/unmap in parallel */
-	__u32 seconds; /* how long the test will last */
-	__s32 node; /* which numa node this benchmark will run on */
-	__u32 dma_bits; /* DMA addressing capability */
-	__u32 dma_dir; /* DMA data direction */
-	__u32 dma_trans_ns; /* time for DMA transmission in ns */
-	__u32 granule;	/* how many PAGE_SIZE will do map/unmap once a time */
-	__u8 expansion[76];	/* For future use */
-};
-
 struct map_benchmark_data {
 	struct map_benchmark bparam;
 	struct device *dev;
-- 
cgit v1.2.3


From 153474ba1a4aed0a7b797b4c2be8c35c7a4e57bd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 11:46:37 -0600
Subject: ptrace: Create ptrace_report_syscall_{entry,exit} in ptrace.h

Rename tracehook_report_syscall_{entry,exit} to
ptrace_report_syscall_{entry,exit} and place them in ptrace.h

There is no longer any generic tracehook infractructure so make
these ptrace specific functions ptrace specific.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-3-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/entry/common.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..f52e57c4d6d8 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,6 +2,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
+#include <linux/tracehook.h>
 #include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
-- 
cgit v1.2.3


From 0cfcb2b9ef48bbcaf5d43b9f1893f63a938e8176 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:00:55 -0600
Subject: ptrace: Remove arch_syscall_{enter,exit}_tracehook

These functions are alwasy one-to-one wrappers around
ptrace_report_syscall_entry and ptrace_report_syscall_exit.
So directly call the functions they are wrapping instead.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-4-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/entry/common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f52e57c4d6d8..f0b1daa1e8da 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -59,7 +59,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 
 	/* Handle ptrace */
 	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
-		ret = arch_syscall_enter_tracehook(regs);
+		ret = ptrace_report_syscall_entry(regs);
 		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
 			return -1L;
 	}
@@ -253,7 +253,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
 
 	step = report_single_step(work);
 	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
-		arch_syscall_exit_tracehook(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
 
 /*
-- 
cgit v1.2.3


From c145137dc990fd67b52fbc52faae5ba46f168cca Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:04:27 -0600
Subject: ptrace: Remove tracehook_signal_handler

The two line function tracehook_signal_handler is only called from
signal_delivered.  Expand it inline in signal_delivered and remove it.
Just to make it easier to understand what is going on.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 38602738866e..0e0bd1c1068b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2898,7 +2898,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
 	set_current_blocked(&blocked);
 	if (current->sas_ss_flags & SS_AUTODISARM)
 		sas_ss_reset(current);
-	tracehook_signal_handler(stepping);
+	if (stepping)
+		ptrace_notify(SIGTRAP);
 }
 
 void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
-- 
cgit v1.2.3


From 8ca07e17c9dd4c4afcb4a3f2ea8f0a0d41c0f982 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 28 Jan 2022 13:55:47 -0600
Subject: task_work: Remove unnecessary include from posix_timers.h

Break a header file circular dependency by removing the unnecessary
include of task_work.h from posix_timers.h.

sched.h -> posix-timers.h
posix-timers.h -> task_work.h
task_work.h -> sched.h

Add missing includes of task_work.h to:
arch/x86/mm/tlb.c
kernel/time/posix-cpu-timers.c

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-6-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/time/posix-cpu-timers.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96b4e7810426..9190d9eb236d 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -15,6 +15,7 @@
 #include <linux/workqueue.h>
 #include <linux/compat.h>
 #include <linux/sched/deadline.h>
+#include <linux/task_work.h>
 
 #include "posix-timers.h"
 
-- 
cgit v1.2.3


From 7f62d40d9cb50fd146fe8ff071f98fa3c1855083 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 08:52:41 -0600
Subject: task_work: Introduce task_work_pending

Wrap the test of task->task_works in a helper function to make
it clear what is being tested.

All of the other readers of task->task_work use READ_ONCE and this is
even necessary on current as other processes can update
task->task_work.  So for consistency I have added READ_ONCE into
task_work_pending.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-7-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c    | 4 ++--
 kernel/task_work.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0e0bd1c1068b..3b4cf25fb9b3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2344,7 +2344,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 void ptrace_notify(int exit_code)
 {
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-	if (unlikely(current->task_works))
+	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
@@ -2626,7 +2626,7 @@ bool get_signal(struct ksignal *ksig)
 	struct signal_struct *signal = current->signal;
 	int signr;
 
-	if (unlikely(current->task_works))
+	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	/*
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 1698fbe6f0e1..cc6fccb0e24d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -78,7 +78,7 @@ task_work_cancel_match(struct task_struct *task,
 	struct callback_head *work;
 	unsigned long flags;
 
-	if (likely(!task->task_works))
+	if (likely(!task_work_pending(task)))
 		return NULL;
 	/*
 	 * If cmpxchg() fails we continue without updating pprev.
-- 
cgit v1.2.3


From 1b773d0003aa90e60953d263c2ee31e6fb4a8a69 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Fri, 11 Mar 2022 00:15:18 +0800
Subject: bpf: Use offsetofend() to simplify macro definition

Use offsetofend() instead of offsetof() + sizeof() to simplify
MIN_BPF_LINEINFO_SIZE macro definition.

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/bpf/20220310161518.534544-1-ytcoode@gmail.com
---
 kernel/bpf/verifier.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e34264200e09..0db6cd8dcb35 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10400,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 		aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
-#define MIN_BPF_LINEINFO_SIZE	(offsetof(struct bpf_line_info, line_col) + \
-		sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE	offsetofend(struct bpf_line_info, line_col)
 #define MAX_LINEINFO_REC_SIZE	MAX_FUNCINFO_REC_SIZE
 
 static int check_btf_line(struct bpf_verifier_env *env,
-- 
cgit v1.2.3


From 173c2049d12b441b498d6423276f5dd76b1e637b Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 24 Feb 2022 10:16:37 -0800
Subject: user_events: Fix potential uninitialized pointer while parsing field

Ensure name is initialized by default to NULL to prevent possible edge
cases that could lead to it being left uninitialized. Add an explicit
check for NULL name to ensure edge boundaries.

Link: https://lore.kernel.org/bpf/20220224105334.GA2248@kili/
Link: https://lore.kernel.org/linux-trace-devel/20220224181637.2129-1-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 2b5e9fdb63a0..9a6191a6a786 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -362,6 +362,8 @@ skip_next:
 	*field++ = '\0';
 	depth++;
 parse:
+	name = NULL;
+
 	while ((part = strsep(&field, " ")) != NULL) {
 		switch (depth++) {
 		case FIELD_DEPTH_TYPE:
@@ -382,7 +384,7 @@ parse:
 		}
 	}
 
-	if (depth < FIELD_DEPTH_SIZE)
+	if (depth < FIELD_DEPTH_SIZE || !name)
 		return -EINVAL;
 
 	if (depth == FIELD_DEPTH_SIZE)
-- 
cgit v1.2.3


From 5677a3d713ad795e3ce32d631941e982777b43db Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 8 Mar 2022 10:39:45 -0500
Subject: tracing: Fix last_cmd_set() string management in histogram code

Using strnlen(dest, str, n) is confusing, as the size of dest must be
strlen(dest) + n + 1. Even more confusing, using sizeof(string constant)
gives you strlen(string constant) + 1 and not just strlen().

These two together made using strncat() with a constant string a bit off
in the calculations as we have:

	len = sizeof(HIST_PREFIX) + strlen(str) + 1;
	kfree(last_cmd);
	last_cmd = kzalloc(len, GFP_KERNEL);
	strcpy(last_cmd, HIST_PREFIX);
	len -= sizeof(HIST_PREFIX) + 1;
	strncat(last_cmd, str, len);

The above works if we s/sizeof/strlen/ with HIST_PREFIX (which is defined
as "hist:", but because sizeof(HIST_PREFIX) is equal to
strlen(HIST_PREFIX) + 1, we can drop the +1 in the code. But at least
comment that we are doing so.

Link: https://lore.kernel.org/all/202203082112.Iu7tvFl4-lkp@intel.com/

Fixes: 9f8e5aee93ed2 ("tracing: Fix allocation of last_cmd in last_cmd_set()")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 78788049f3d3..954b19e2f196 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -749,14 +749,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
 	if (!str)
 		return;
 
-	len = sizeof(HIST_PREFIX) + strlen(str) + 1;
+	/* sizeof() contains the nul byte */
+	len = sizeof(HIST_PREFIX) + strlen(str);
 	kfree(last_cmd);
 	last_cmd = kzalloc(len, GFP_KERNEL);
 	if (!last_cmd)
 		return;
 
 	strcpy(last_cmd, HIST_PREFIX);
-	len -= sizeof(HIST_PREFIX) + 1;
+	/* Again, sizeof() contains the nul byte */
+	len -= sizeof(HIST_PREFIX);
 	strncat(last_cmd, str, len);
 
 	if (file) {
-- 
cgit v1.2.3


From 8bcd06632b73291e8734f46084ed04f4106b840a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 3 Mar 2022 17:05:31 -0500
Subject: tracing: Allow custom events to be added to the tracefs directory

Allow custom events to be added to the events directory in the tracefs
file system. For example, a module could be installed that attaches to an
event and wants to be enabled and disabled via the tracefs file system. It
would use trace_add_event_call() to add the event to the tracefs
directory, and trace_remove_event_call() to remove it.

Make both those functions EXPORT_SYMBOL_GPL().

Link: https://lkml.kernel.org/r/20220303220625.186988045@goodmis.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..38afd66d80e3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2758,6 +2758,7 @@ int trace_add_event_call(struct trace_event_call *call)
 	mutex_unlock(&trace_types_lock);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(trace_add_event_call);
 
 /*
  * Must be called under locking of trace_types_lock, event_mutex and
@@ -2819,6 +2820,7 @@ int trace_remove_event_call(struct trace_event_call *call)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
 
 #define for_each_event(event, start, end)			\
 	for (event = start;					\
-- 
cgit v1.2.3


From 8ba62d37949e248c698c26e0d82d72fda5d33ebf Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 09:51:14 -0600
Subject: task_work: Call tracehook_notify_signal from get_signal on all
 architectures

Always handle TIF_NOTIFY_SIGNAL in get_signal.  With commit 35d0b389f3b2
("task_work: unconditionally run task_work from get_signal()") always
calling task_work_run all of the work of tracehook_notify_signal is
already happening except clearing TIF_NOTIFY_SIGNAL.

Factor clear_notify_signal out of tracehook_notify_signal and use it in
get_signal so that get_signal only needs one call of task_work_run.

To keep the semantics in sync update xfer_to_guest_mode_work (which
does not call get_signal) to call tracehook_notify_signal if either
_TIF_SIGPENDING or _TIF_NOTIFY_SIGNAL.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-8-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/entry/common.c | 12 ++----------
 kernel/entry/kvm.c    |  2 +-
 kernel/signal.c       | 14 +++-----------
 3 files changed, 6 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f0b1daa1e8da..79eaf9b4b10d 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -139,15 +139,7 @@ void noinstr exit_to_user_mode(void)
 }
 
 /* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
-	if (ti_work & _TIF_NOTIFY_SIGNAL)
-		tracehook_notify_signal();
-
-	arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 					    unsigned long ti_work)
@@ -170,7 +162,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 			klp_update_patch_state(current);
 
 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
-			handle_signal_work(regs, ti_work);
+			arch_do_signal_or_restart(regs);
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
 			tracehook_notify_resume(regs);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 96d476e06c77..cabf36a489e4 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,7 +8,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 	do {
 		int ret;
 
-		if (ti_work & _TIF_NOTIFY_SIGNAL)
+		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 			tracehook_notify_signal();
 
 		if (ti_work & _TIF_SIGPENDING) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 3b4cf25fb9b3..8632b88982c9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2626,20 +2626,12 @@ bool get_signal(struct ksignal *ksig)
 	struct signal_struct *signal = current->signal;
 	int signr;
 
+	clear_notify_signal();
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
-	/*
-	 * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
-	 * that the arch handlers don't all have to do it. If we get here
-	 * without TIF_SIGPENDING, just exit after running signal work.
-	 */
-	if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
-		if (test_thread_flag(TIF_NOTIFY_SIGNAL))
-			tracehook_notify_signal();
-		if (!task_sigpending(current))
-			return false;
-	}
+	if (!task_sigpending(current))
+		return false;
 
 	if (unlikely(uprobe_deny_signal()))
 		return false;
-- 
cgit v1.2.3


From 7c5d8fa6fbb12a3f0eefe8762bfede508e147cb3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 11:18:54 -0600
Subject: task_work: Decouple TIF_NOTIFY_SIGNAL and task_work

There are a small handful of reasons besides pending signals that the
kernel might want to break out of interruptible sleeps.  The flag
TIF_NOTIFY_SIGNAL and the helpers that set and clear TIF_NOTIFY_SIGNAL
provide that the infrastructure for breaking out of interruptible
sleeps and entering the return to user space slow path for those
cases.

Expand tracehook_notify_signal inline in it's callers and remove it,
which makes clear that TIF_NOTIFY_SIGNAL and task_work are separate
concepts.

Update the comment on set_notify_signal to more accurately describe
it's purpose.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-9-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/entry/kvm.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index cabf36a489e4..3ab5f98988c3 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,8 +8,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 	do {
 		int ret;
 
-		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
-			tracehook_notify_signal();
+		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+			clear_notify_signal();
+			if (task_work_pending(current))
+				task_work_run();
+		}
 
 		if (ti_work & _TIF_SIGPENDING) {
 			kvm_handle_signal_exit(vcpu);
-- 
cgit v1.2.3


From 03248addadf1a5ef0a03cbcd5ec905b49adb9658 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 12:20:45 -0600
Subject: resume_user_mode: Move to resume_user_mode.h

Move set_notify_resume and tracehook_notify_resume into resume_user_mode.h.
While doing that rename tracehook_notify_resume to resume_user_mode_work.

Update all of the places that included tracehook.h for these functions to
include resume_user_mode.h instead.

Update all of the callers of tracehook_notify_resume to call
resume_user_mode_work.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-12-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/entry/common.c | 4 ++--
 kernel/entry/kvm.c    | 2 +-
 kernel/task_work.c    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 79eaf9b4b10d..a86823cad853 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,7 +2,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
@@ -165,7 +165,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 			arch_do_signal_or_restart(regs);
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
-			tracehook_notify_resume(regs);
+			resume_user_mode_work(regs);
 
 		/* Architecture specific TIF work */
 		arch_exit_to_user_mode_work(regs, ti_work);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 3ab5f98988c3..9d09f489b60e 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -23,7 +23,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 			schedule();
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
-			tracehook_notify_resume(NULL);
+			resume_user_mode_work(NULL);
 
 		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
 		if (ret)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index cc6fccb0e24d..c59e1a49bc40 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/spinlock.h>
 #include <linux/task_work.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
-- 
cgit v1.2.3


From 355f841a3f8ca980c9682937a5257d3a1f6fc09d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 12:47:08 -0600
Subject: tracehook: Remove tracehook.h

Now that all of the definitions have moved out of tracehook.h into
ptrace.h, sched/signal.h, resume_user_mode.h there is nothing left in
tracehook.h so remove it.

Update the few files that were depending upon tracehook.h to bring in
definitions to use the headers they need directly.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-13-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/exit.c                 | 3 ++-
 kernel/livepatch/transition.c | 1 -
 kernel/seccomp.c              | 1 -
 kernel/signal.c               | 2 +-
 4 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..9326d1f97fc7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,8 @@
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..df808d97d84f 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,7 +9,6 @@
 
 #include <linux/cpu.h>
 #include <linux/stacktrace.h>
-#include <linux/tracehook.h>
 #include "core.h"
 #include "patch.h"
 #include "transition.h"
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4d8f44a17727..63198086ee83 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,7 +39,6 @@
 #include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/capability.h>
-#include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
 #include <linux/lockdep.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 8632b88982c9..c2dee5420567 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,7 +32,7 @@
 #include <linux/signal.h>
 #include <linux/signalfd.h>
 #include <linux/ratelimit.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
-- 
cgit v1.2.3


From 174b16946e39ebd369097e0f773536c91a8c1a4c Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Wed, 2 Mar 2022 12:13:58 +0100
Subject: bpf-lsm: Introduce new helper bpf_ima_file_hash()

ima_file_hash() has been modified to calculate the measurement of a file on
demand, if it has not been already performed by IMA or the measurement is
not fresh. For compatibility reasons, ima_inode_hash() remains unchanged.

Keep the same approach in eBPF and introduce the new helper
bpf_ima_file_hash() to take advantage of the modified behavior of
ima_file_hash().

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220302111404.193900-4-roberto.sassu@huawei.com
---
 kernel/bpf/bpf_lsm.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9e4ecc990647..e8d27af5bbcc 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
 	.allowed	= bpf_ima_inode_hash_allowed,
 };
 
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+	return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+	.func		= bpf_ima_file_hash,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &bpf_ima_file_hash_btf_ids[0],
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.allowed	= bpf_ima_inode_hash_allowed,
+};
+
 static const struct bpf_func_proto *
 bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_bprm_opts_set_proto;
 	case BPF_FUNC_ima_inode_hash:
 		return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+	case BPF_FUNC_ima_file_hash:
+		return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
 	default:
 		return tracing_prog_func_proto(func_id, prog);
 	}
-- 
cgit v1.2.3


From df6b3039fa112e17555776213cab7f07c0a8d98d Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Wed, 2 Mar 2022 12:14:02 +0100
Subject: bpf-lsm: Make bpf_lsm_kernel_read_file() as sleepable

Make bpf_lsm_kernel_read_file() as sleepable, so that bpf_ima_inode_hash()
or bpf_ima_file_hash() can be called inside the implementation of this
hook.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220302111404.193900-8-roberto.sassu@huawei.com
---
 kernel/bpf/bpf_lsm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e8d27af5bbcc..064eccba641d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -187,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr)
 BTF_ID(func, bpf_lsm_inode_symlink)
 BTF_ID(func, bpf_lsm_inode_unlink)
 BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
 BTF_ID(func, bpf_lsm_kernfs_init_security)
 
 #ifdef CONFIG_KEYS
-- 
cgit v1.2.3


From 7e348b325bc40eb52aead4d57a1f90d33ea834fc Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Wed, 9 Mar 2022 16:11:41 -0800
Subject: user_events: Prevent dyn_event delete racing with ioctl add/delete

Find user_events always while under the event_mutex and before leaving
the lock, add a ref count to the user_event. This ensures that all paths
under the event_mutex that check the ref counts will be synchronized.

The ioctl add/delete paths are protected by the reg_mutex. However,
dyn_event is only protected by the event_mutex. The dyn_event delete
path cannot acquire reg_mutex, since that could cause a deadlock between
the ioctl delete case acquiring event_mutex after acquiring the reg_mutex.

Link: https://lkml.kernel.org/r/20220310001141.1660-1-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 46 ++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9a6191a6a786..4febc1d6ae72 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -135,6 +135,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
  * NOTE: Offsets are from the user data perspective, they are not from the
  * trace_entry/buffer perspective. We automatically add the common properties
  * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
  */
 static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
 {
@@ -593,8 +595,10 @@ static struct user_event *find_user_event(char *name, u32 *outkey)
 	*outkey = key;
 
 	hash_for_each_possible(register_table, user, node, key)
-		if (!strcmp(EVENT_NAME(user), name))
+		if (!strcmp(EVENT_NAME(user), name)) {
+			atomic_inc(&user->refcnt);
 			return user;
+		}
 
 	return NULL;
 }
@@ -883,7 +887,12 @@ static int user_event_create(const char *raw_command)
 		return -ENOMEM;
 
 	mutex_lock(&reg_mutex);
+
 	ret = user_event_parse_cmd(name, &user);
+
+	if (!ret)
+		atomic_dec(&user->refcnt);
+
 	mutex_unlock(&reg_mutex);
 
 	if (ret)
@@ -1050,6 +1059,7 @@ static int user_event_trace_register(struct user_event *user)
 /*
  * Parses the event name, arguments and flags then registers if successful.
  * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
  */
 static int user_event_parse(char *name, char *args, char *flags,
 			    struct user_event **newuser)
@@ -1057,7 +1067,12 @@ static int user_event_parse(char *name, char *args, char *flags,
 	int ret;
 	int index;
 	u32 key;
-	struct user_event *user = find_user_event(name, &key);
+	struct user_event *user;
+
+	/* Prevent dyn_event from racing */
+	mutex_lock(&event_mutex);
+	user = find_user_event(name, &key);
+	mutex_unlock(&event_mutex);
 
 	if (user) {
 		*newuser = user;
@@ -1121,6 +1136,10 @@ static int user_event_parse(char *name, char *args, char *flags,
 		goto put_user;
 
 	user->index = index;
+
+	/* Ensure we track ref */
+	atomic_inc(&user->refcnt);
+
 	dyn_event_init(&user->devent, &user_event_dops);
 	dyn_event_add(&user->devent, &user->call);
 	set_bit(user->index, page_bitmap);
@@ -1147,12 +1166,21 @@ static int delete_user_event(char *name)
 	if (!user)
 		return -ENOENT;
 
-	if (atomic_read(&user->refcnt) != 0)
-		return -EBUSY;
+	/* Ensure we are the last ref */
+	if (atomic_read(&user->refcnt) != 1) {
+		ret = -EBUSY;
+		goto put_ref;
+	}
 
-	mutex_lock(&event_mutex);
 	ret = destroy_user_event(user);
-	mutex_unlock(&event_mutex);
+
+	if (ret)
+		goto put_ref;
+
+	return ret;
+put_ref:
+	/* No longer have this ref */
+	atomic_dec(&user->refcnt);
 
 	return ret;
 }
@@ -1340,6 +1368,9 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 
 	ret = user_events_ref_add(file, user);
 
+	/* No longer need parse ref, ref_add either worked or not */
+	atomic_dec(&user->refcnt);
+
 	/* Positive number is index and valid */
 	if (ret < 0)
 		return ret;
@@ -1364,7 +1395,10 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
+	/* event_mutex prevents dyn_event from racing */
+	mutex_lock(&event_mutex);
 	ret = delete_user_event(name);
+	mutex_unlock(&event_mutex);
 
 	kfree(name);
 
-- 
cgit v1.2.3


From 9f438d4d7fa2f59570cd58bca24ac82e7172d63b Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Thu, 10 Mar 2022 11:17:07 -0600
Subject: tracing: Fix strncpy warning in trace_events_synth.c

0-day reported the strncpy error below:

../kernel/trace/trace_events_synth.c: In function 'last_cmd_set':
../kernel/trace/trace_events_synth.c:65:9: warning: 'strncpy' specified bound depends on the length o\
f the source argument [-Wstringop-truncation]
   65 |         strncpy(last_cmd, str, strlen(str) + 1);
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../kernel/trace/trace_events_synth.c:65:32: note: length computed here
   65 |         strncpy(last_cmd, str, strlen(str) + 1);
      |                                ^~~~~~~~~~~

There's no reason to use strncpy here, in fact there's no reason to do
anything but a simple kstrdup() (note we don't even need to check for
failure since last_cmod is expected to be either the last cmd string
or NULL, and the containing function is a void return).

Link: https://lkml.kernel.org/r/77deca8cbfd226981b3f1eab203967381e9b5bd9.camel@kernel.org

Fixes: 27c888da9867 ("tracing: Remove size restriction on synthetic event cmd error logging")

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index fdd79e07e2fc..5e8c07aef071 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -58,11 +58,8 @@ static void last_cmd_set(const char *str)
 		return;
 
 	kfree(last_cmd);
-	last_cmd = kzalloc(strlen(str) + 1, GFP_KERNEL);
-	if (!last_cmd)
-		return;
 
-	strncpy(last_cmd, str, strlen(str) + 1);
+	last_cmd = kstrdup(str, GFP_KERNEL);
 }
 
 static void synth_err(u8 err_type, u16 err_pos)
-- 
cgit v1.2.3


From b3bc8547d3be60898818885f5bf22d0a62e2eb48 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 10 Mar 2022 23:27:38 -0500
Subject: tracing: Have TRACE_DEFINE_ENUM affect trace event types as well

The macro TRACE_DEFINE_ENUM is used to convert enums in the kernel to
their actual value when they are exported to user space via the trace
event format file.

Currently only the enums in the "print fmt" (TP_printk in the TRACE_EVENT
macro) have the enums converted. But the enums can be used to denote array
size:

        field:unsigned int fc_ineligible_rc[EXT4_FC_REASON_MAX]; offset:12;      size:36;        signed:0;

The EXT4_FC_REASON_MAX has no meaning to userspace but it needs to know
that information to know how to parse the array.

Have the array indexes also be parsed as well.

Link: https://lore.kernel.org/all/cover.1646922487.git.riteshh@linux.ibm.com/

Reported-by: Ritesh Harjani <riteshh@linux.ibm.com>
Tested-by: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 38afd66d80e3..ae9a3b8481f5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2633,6 +2633,33 @@ static void update_event_printk(struct trace_event_call *call,
 	}
 }
 
+static void update_event_fields(struct trace_event_call *call,
+				struct trace_eval_map *map)
+{
+	struct ftrace_event_field *field;
+	struct list_head *head;
+	char *ptr;
+	int len = strlen(map->eval_string);
+
+	head = trace_get_fields(call);
+	list_for_each_entry(field, head, link) {
+		ptr = strchr(field->type, '[');
+		if (!ptr)
+			continue;
+		ptr++;
+
+		if (!isalpha(*ptr) && *ptr != '_')
+			continue;
+
+		if (strncmp(map->eval_string, ptr, len) != 0)
+			continue;
+
+		ptr = eval_replace(ptr, map, len);
+		/* enum/sizeof string smaller than value */
+		WARN_ON_ONCE(!ptr);
+	}
+}
+
 void trace_event_eval_update(struct trace_eval_map **map, int len)
 {
 	struct trace_event_call *call, *p;
@@ -2668,6 +2695,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 					first = false;
 				}
 				update_event_printk(call, map[i]);
+				update_event_fields(call, map[i]);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 380af29b8d7670c445965bd573ab219aff0c4c11 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 10 Mar 2022 21:37:09 -0500
Subject: tracing: Add snapshot at end of kernel boot up

Add ftrace_boot_snapshot kernel parameter that will take a snapshot at the
end of boot up just before switching over to user space (it happens during
the kernel freeing of init memory).

This is useful when there's interesting data that can be collected from
kernel start up, but gets overridden by user space start up code. With
this option, the ring buffer content from the boot up traces gets saved in
the snapshot at the end of boot up. This trace can be read from:

 /sys/kernel/tracing/snapshot

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c |  2 ++
 kernel/trace/trace.c  | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9feb197b2da..4e29bd1cf151 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7096,6 +7096,8 @@ void __init ftrace_free_init_mem(void)
 	void *start = (void *)(&__init_begin);
 	void *end = (void *)(&__init_end);
 
+	ftrace_boot_snapshot();
+
 	ftrace_free_mem(NULL, start, end);
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7c85ce9ffdc3..eaf7d30ca6f1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
 static char *default_bootup_tracer;
 
 static bool allocate_snapshot;
+static bool snapshot_at_boot;
 
 static int __init set_cmdline_ftrace(char *str)
 {
@@ -230,6 +231,15 @@ static int __init boot_alloc_snapshot(char *str)
 __setup("alloc_snapshot", boot_alloc_snapshot);
 
 
+static int __init boot_snapshot(char *str)
+{
+	snapshot_at_boot = true;
+	boot_alloc_snapshot(str);
+	return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
 static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
 
 static int __init set_trace_boot_options(char *str)
@@ -10149,6 +10159,14 @@ out:
 	return ret;
 }
 
+void __init ftrace_boot_snapshot(void)
+{
+	if (snapshot_at_boot) {
+		tracing_snapshot();
+		internal_trace_puts("** Boot snapshot taken **\n");
+	}
+}
+
 void __init early_trace_init(void)
 {
 	if (tracepoint_printk) {
-- 
cgit v1.2.3


From bc47ee4844d6b7d7351536cd99d35848c4449689 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 11 Mar 2022 18:59:57 -0500
Subject: tracing/user_events: Use alloc_pages instead of kzalloc() for
 register pages

kzalloc virtual addresses do not work with SetPageReserved, use the actual
page virtual addresses instead via alloc_pages.

The issue is reported when booting with user_events and
DEBUG_VM_PGFLAGS=y.

Also make the number of events based on the ORDER.

Link: https://lore.kernel.org/all/CADYN=9+xY5Vku3Ws5E9S60SM5dCFfeGeRBkmDFbcxX0ZMoFing@mail.gmail.com/
Link: https://lore.kernel.org/all/20220311223028.1865-1-beaub@linux.microsoft.com/

Cc: Beau Belgrave <beaub@linux.microsoft.com>
Reported-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 4febc1d6ae72..e10ad057e797 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -30,9 +30,10 @@
 
 /*
  * Limits how many trace_event calls user processes can create:
- * Must be multiple of PAGE_SIZE.
+ * Must be a power of two of PAGE_SIZE.
  */
-#define MAX_PAGES 1
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
 #define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
 
 /* Limit how long of an event name plus args within the subsystem. */
@@ -1622,16 +1623,17 @@ static void set_page_reservations(bool set)
 
 static int __init trace_events_user_init(void)
 {
+	struct page *pages;
 	int ret;
 
 	/* Zero all bits beside 0 (which is reserved for failures) */
 	bitmap_zero(page_bitmap, MAX_EVENTS);
 	set_bit(0, page_bitmap);
 
-	register_page_data = kzalloc(MAX_EVENTS, GFP_KERNEL);
-
-	if (!register_page_data)
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+	if (!pages)
 		return -ENOMEM;
+	register_page_data = page_address(pages);
 
 	set_page_reservations(true);
 
@@ -1640,7 +1642,7 @@ static int __init trace_events_user_init(void)
 	if (ret) {
 		pr_warn("user_events could not register with tracefs\n");
 		set_page_reservations(false);
-		kfree(register_page_data);
+		__free_pages(pages, MAX_PAGE_ORDER);
 		return ret;
 	}
 
-- 
cgit v1.2.3


From f9da322e864e5cd3dc217480e73f78f47cf40c5b Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sat, 5 Mar 2022 12:46:57 -0800
Subject: cgroup: cleanup comments

for spdx, add a space before //

replacements
judgement to judgment
transofrmed to transformed
partitition to partition
histrical to historical
migratecd to migrated

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c  | 10 +++++-----
 kernel/cgroup/freezer.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dc653ab26e50..8324a8fd71bb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /*
  * There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
  * of the situation.
  */
 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -1151,7 +1151,7 @@ enum subparts_cmd {
  * effective_cpus. The function will return 0 if all the CPUs listed in
  * cpus_allowed can be granted or an error code will be returned.
  *
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
  * root back to a non-partition root. Any CPUs in cpus_allowed that are in
  * parent's subparts_cpus will be taken away from that cpumask and put back
  * into parent's effective_cpus. 0 should always be returned.
@@ -1990,7 +1990,7 @@ out:
 }
 
 /*
- * update_prstate - update partititon_root_state
+ * update_prstate - update partition_root_state
  * cs: the cpuset to update
  * new_prs: new partition root state
  *
@@ -2840,7 +2840,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	/*
 	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
 	 * set.  This flag handling is implemented in cgroup core for
-	 * histrical reasons - the flag may be specified during mount.
+	 * historical reasons - the flag may be specified during mount.
 	 *
 	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
 	 * refuse to clone the configuration - thereby refusing the task to
@@ -3037,7 +3037,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
-	 * as the tasks will be migratecd to an ancestor.
+	 * as the tasks will be migrated to an ancestor.
 	 */
 	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
 		update_tasks_cpumask(cs);
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/cgroup.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
-- 
cgit v1.2.3


From aebfd12521d9c7d0b502cf6d06314cfbcdccfe3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 8 Mar 2022 16:30:29 +0100
Subject: x86/ibt,ftrace: Search for __fentry__ location

Currently a lot of ftrace code assumes __fentry__ is at sym+0. However
with Intel IBT enabled the first instruction of a function will most
likely be ENDBR.

Change ftrace_location() to not only return the __fentry__ location
when called for the __fentry__ location, but also when called for the
sym+0 location.

Then audit/update all callsites of this function to consistently use
these new semantics.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20220308154318.227581603@infradead.org
---
 kernel/bpf/trampoline.c | 20 ++++----------------
 kernel/kprobes.c        |  8 ++------
 kernel/trace/ftrace.c   | 48 ++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 46 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5e7edf913060..455f1b4f312e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -117,18 +117,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
 	tr->mod = NULL;
 }
 
-static int is_ftrace_location(void *ip)
-{
-	long addr;
-
-	addr = ftrace_location((long)ip);
-	if (!addr)
-		return 0;
-	if (WARN_ON_ONCE(addr != (long)ip))
-		return -EFAULT;
-	return 1;
-}
-
 static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 {
 	void *ip = tr->func.addr;
@@ -160,12 +148,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
 static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 {
 	void *ip = tr->func.addr;
+	unsigned long faddr;
 	int ret;
 
-	ret = is_ftrace_location(ip);
-	if (ret < 0)
-		return ret;
-	tr->func.ftrace_managed = ret;
+	faddr = ftrace_location((unsigned long)ip);
+	if (faddr)
+		tr->func.ftrace_managed = true;
 
 	if (bpf_trampoline_module_get(tr))
 		return -ENOENT;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 94cab8c9ce56..6d1e11cda4f1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1562,14 +1562,10 @@ static inline int warn_kprobe_rereg(struct kprobe *p)
 
 static int check_ftrace_location(struct kprobe *p)
 {
-	unsigned long ftrace_addr;
+	unsigned long addr = (unsigned long)p->addr;
 
-	ftrace_addr = ftrace_location((unsigned long)p->addr);
-	if (ftrace_addr) {
+	if (ftrace_location(addr) == addr) {
 #ifdef CONFIG_KPROBES_ON_FTRACE
-		/* Given address is not on the instruction boundary */
-		if ((unsigned long)p->addr != ftrace_addr)
-			return -EILSEQ;
 		p->flags |= KPROBE_FLAG_FTRACE;
 #else	/* !CONFIG_KPROBES_ON_FTRACE */
 		return -EINVAL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6105b7036482..2ae6fb9a5b12 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1568,17 +1568,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
 }
 
 /**
- * ftrace_location - return true if the ip giving is a traced location
+ * ftrace_location - return the ftrace location
  * @ip: the instruction pointer to check
  *
- * Returns rec->ip if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
+ * If @ip matches the ftrace location, return @ip.
+ * If @ip matches sym+0, return sym's ftrace location.
+ * Otherwise, return 0.
  */
 unsigned long ftrace_location(unsigned long ip)
 {
-	return ftrace_location_range(ip, ip);
+	struct dyn_ftrace *rec;
+	unsigned long offset;
+	unsigned long size;
+
+	rec = lookup_rec(ip, ip);
+	if (!rec) {
+		if (!kallsyms_lookup_size_offset(ip, &size, &offset))
+			goto out;
+
+		/* map sym+0 to __fentry__ */
+		if (!offset)
+			rec = lookup_rec(ip, ip + size - 1);
+	}
+
+	if (rec)
+		return rec->ip;
+
+out:
+	return 0;
 }
 
 /**
@@ -4962,7 +4979,8 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
 {
 	struct ftrace_func_entry *entry;
 
-	if (!ftrace_location(ip))
+	ip = ftrace_location(ip);
+	if (!ip)
 		return -EINVAL;
 
 	if (remove) {
@@ -5110,11 +5128,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 	struct ftrace_func_entry *entry;
 	struct ftrace_hash *free_hash = NULL;
 	struct dyn_ftrace *rec;
-	int ret = -EBUSY;
+	int ret = -ENODEV;
 
 	mutex_lock(&direct_mutex);
 
+	ip = ftrace_location(ip);
+	if (!ip)
+		goto out_unlock;
+
 	/* See if there's a direct function at @ip already */
+	ret = -EBUSY;
 	if (ftrace_find_rec_direct(ip))
 		goto out_unlock;
 
@@ -5222,6 +5245,10 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
 
 	mutex_lock(&direct_mutex);
 
+	ip = ftrace_location(ip);
+	if (!ip)
+		goto out_unlock;
+
 	entry = find_direct_entry(&ip, NULL);
 	if (!entry)
 		goto out_unlock;
@@ -5354,6 +5381,11 @@ int modify_ftrace_direct(unsigned long ip,
 	mutex_lock(&direct_mutex);
 
 	mutex_lock(&ftrace_lock);
+
+	ip = ftrace_location(ip);
+	if (!ip)
+		goto out_unlock;
+
 	entry = find_direct_entry(&ip, &rec);
 	if (!entry)
 		goto out_unlock;
-- 
cgit v1.2.3


From d15cb3dab1e4f00e29599a4f5e1f6678a530d270 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 8 Mar 2022 16:30:30 +0100
Subject: x86/livepatch: Validate __fentry__ location

Currently livepatch assumes __fentry__ lives at func+0, which is most
likely untrue with IBT on. Instead make it use ftrace_location() by
default which both validates and finds the actual ip if there is any
in the same symbol.

Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20220308154318.285971256@infradead.org
---
 kernel/livepatch/patch.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index fe316c021d73..c172bf92b576 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -124,19 +124,6 @@ unlock:
 	ftrace_test_recursion_unlock(bit);
 }
 
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
-	return faddr;
-}
-#endif
-
 static void klp_unpatch_func(struct klp_func *func)
 {
 	struct klp_ops *ops;
@@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func)
 	if (list_is_singular(&ops->func_stack)) {
 		unsigned long ftrace_loc;
 
-		ftrace_loc =
-			klp_get_ftrace_location((unsigned long)func->old_func);
+		ftrace_loc = ftrace_location((unsigned long)func->old_func);
 		if (WARN_ON(!ftrace_loc))
 			return;
 
@@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func)
 	if (!ops) {
 		unsigned long ftrace_loc;
 
-		ftrace_loc =
-			klp_get_ftrace_location((unsigned long)func->old_func);
+		ftrace_loc = ftrace_location((unsigned long)func->old_func);
 		if (!ftrace_loc) {
 			pr_err("failed to find location for function '%s'\n",
 				func->old_name);
-- 
cgit v1.2.3


From cc66bb91457827f62e2b6cb2518666820f0a6c48 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 8 Mar 2022 16:30:32 +0100
Subject: x86/ibt,kprobes: Cure sym+0 equals fentry woes

In order to allow kprobes to skip the ENDBR instructions at sym+0 for
X86_KERNEL_IBT builds, change _kprobe_addr() to take an architecture
callback to inspect the function at hand and modify the offset if
needed.

This streamlines the existing interface to cover more cases and
require less hooks. Once PowerPC gets fully converted there will only
be the one arch hook.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20220308154318.405947704@infradead.org
---
 kernel/kprobes.c | 66 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6d1e11cda4f1..185badc780b7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1488,25 +1488,69 @@ bool within_kprobe_blacklist(unsigned long addr)
 	return false;
 }
 
+/*
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+						unsigned long offset,
+						bool *on_func_entry)
+{
+	*on_func_entry = !offset;
+	return (kprobe_opcode_t *)(addr + offset);
+}
+
 /*
  * If 'symbol_name' is specified, look it up and add the 'offset'
  * to it. This way, we can specify a relative address to a symbol.
  * This returns encoded errors if it fails to look up symbol or invalid
  * combination of parameters.
  */
-static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
-			const char *symbol_name, unsigned int offset)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+	     unsigned long offset, bool *on_func_entry)
 {
 	if ((symbol_name && addr) || (!symbol_name && !addr))
 		goto invalid;
 
 	if (symbol_name) {
+		/*
+		 * Input: @sym + @offset
+		 * Output: @addr + @offset
+		 *
+		 * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+		 *       argument into it's output!
+		 */
 		addr = kprobe_lookup_name(symbol_name, offset);
 		if (!addr)
 			return ERR_PTR(-ENOENT);
 	}
 
-	addr = (kprobe_opcode_t *)(((char *)addr) + offset);
+	/*
+	 * So here we have @addr + @offset, displace it into a new
+	 * @addr' + @offset' where @addr' is the symbol start address.
+	 */
+	addr = (void *)addr + offset;
+	if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+		return ERR_PTR(-ENOENT);
+	addr = (void *)addr - offset;
+
+	/*
+	 * Then ask the architecture to re-combine them, taking care of
+	 * magical function entry details while telling us if this was indeed
+	 * at the start of the function.
+	 */
+	addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
 	if (addr)
 		return addr;
 
@@ -1516,7 +1560,8 @@ invalid:
 
 static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
 {
-	return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+	bool on_func_entry;
+	return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
 }
 
 /*
@@ -2043,11 +2088,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
-bool __weak arch_kprobe_on_func_entry(unsigned long offset)
-{
-	return !offset;
-}
-
 /**
  * kprobe_on_func_entry() -- check whether given address is function entry
  * @addr: Target address
@@ -2063,15 +2103,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
  */
 int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
 {
-	kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+	bool on_func_entry;
+	kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
 
 	if (IS_ERR(kp_addr))
 		return PTR_ERR(kp_addr);
 
-	if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
-		return -ENOENT;
-
-	if (!arch_kprobe_on_func_entry(offset))
+	if (!on_func_entry)
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.2.3


From eae654f1c21216daa9fbb92591c0d9f5ae46cfc5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 8 Mar 2022 16:30:48 +0100
Subject: exit: Mark do_group_exit() __noreturn

vmlinux.o: warning: objtool: get_signal()+0x108: unreachable instruction

0000 000000000007f930 <get_signal>:
...
0103    7fa33:  e8 00 00 00 00          call   7fa38 <get_signal+0x108> 7fa34: R_X86_64_PLT32   do_group_exit-0x4
0108    7fa38:  41 8b 45 74             mov    0x74(%r13),%eax

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lore.kernel.org/r/20220308154319.351270711@infradead.org
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..b71f9df9074e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -906,7 +906,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
  */
-void
+void __noreturn
 do_group_exit(int exit_code)
 {
 	struct signal_struct *sig = current->signal;
-- 
cgit v1.2.3


From 105cd68596392cfe15056a891b0723609dcad247 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 14 Mar 2022 17:58:35 +0100
Subject: x86: Mark __invalid_creds() __noreturn

vmlinux.o: warning: objtool: ksys_unshare()+0x36c: unreachable instruction

0000 0000000000067040 <ksys_unshare>:
...
0364    673a4:	4c 89 ef             	mov    %r13,%rdi
0367    673a7:	e8 00 00 00 00       	call   673ac <ksys_unshare+0x36c>	673a8: R_X86_64_PLT32	__invalid_creds-0x4
036c    673ac:	e9 28 ff ff ff       	jmp    672d9 <ksys_unshare+0x299>
0371    673b1:	41 bc f4 ff ff ff    	mov    $0xfffffff4,%r12d
0377    673b7:	e9 80 fd ff ff       	jmp    6713c <ksys_unshare+0xfc>

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/Yi9gOW9f1GGwwUD6@hirez.programming.kicks-ass.net
---
 kernel/cred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 933155c96922..e10c15f51c1f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -870,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
 /*
  * report use of invalid credentials
  */
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
 {
 	printk(KERN_ERR "CRED: Invalid credentials\n");
 	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
-- 
cgit v1.2.3


From a7b2553b5ece1aba4b5994eef150d0a1269b5805 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 15 Mar 2022 10:33:53 +0100
Subject: sched/headers: Only include <linux/entry-common.h> when
 CONFIG_GENERIC_ENTRY=y

This header is not (yet) standalone.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aa9e14ce94fb..bf443f470f28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -63,7 +63,9 @@
 #include <linux/workqueue_api.h>
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
-# include <linux/entry-common.h>
+# ifdef CONFIG_GENERIC_ENTRY
+#  include <linux/entry-common.h>
+# endif
 #endif
 
 #include <uapi/linux/sched/types.h>
-- 
cgit v1.2.3


From 089331d47325d3f55016ed3d1f79ffd2754fb19e Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 8 Mar 2022 14:28:07 -0800
Subject: user_events: Add trace event call as root for low permission cases

Tracefs by default is locked down heavily. System operators can open up
some files, such as user_events to a broader set of users. These users
do not have access within tracefs beyond just the user_event files. Due
to this restriction the trace_add_event_call/remove calls will silently
fail since the caller does not have permissions to create directories.

To fix this trace_add_event_call/remove calls will be issued with
override creds of the global root UID. Creds are reverted immediately
afterward.

Link: https://lkml.kernel.org/r/20220308222807.2040-1-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index e10ad057e797..8b3d241a31c2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -562,6 +562,41 @@ static struct trace_event_functions user_event_funcs = {
 	.trace = user_event_print_trace,
 };
 
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+	int ret;
+	const struct cred *old_cred;
+	struct cred *cred;
+
+	cred = prepare_creds();
+
+	if (!cred)
+		return -ENOMEM;
+
+	/*
+	 * While by default tracefs is locked down, systems can be configured
+	 * to allow user_event files to be less locked down. The extreme case
+	 * being "other" has read/write access to user_events_data/status.
+	 *
+	 * When not locked down, processes may not have have permissions to
+	 * add/remove calls themselves to tracefs. We need to temporarily
+	 * switch to root file permission to allow for this scenario.
+	 */
+	cred->fsuid = GLOBAL_ROOT_UID;
+
+	old_cred = override_creds(cred);
+
+	if (visible)
+		ret = trace_add_event_call(&user->call);
+	else
+		ret = trace_remove_event_call(&user->call);
+
+	revert_creds(old_cred);
+	put_cred(cred);
+
+	return ret;
+}
+
 static int destroy_user_event(struct user_event *user)
 {
 	int ret = 0;
@@ -569,7 +604,7 @@ static int destroy_user_event(struct user_event *user)
 	/* Must destroy fields before call removal */
 	user_event_destroy_fields(user);
 
-	ret = trace_remove_event_call(&user->call);
+	ret = user_event_set_call_visible(user, false);
 
 	if (ret)
 		return ret;
@@ -1049,7 +1084,7 @@ static int user_event_trace_register(struct user_event *user)
 	if (!ret)
 		return -ENODEV;
 
-	ret = trace_add_event_call(&user->call);
+	ret = user_event_set_call_visible(user, true);
 
 	if (ret)
 		unregister_trace_event(&user->call.event);
-- 
cgit v1.2.3


From d2a3b7c5becc3992f8e7d2b9bf5eacceeedb9a48 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 9 Mar 2022 20:33:20 +0800
Subject: bpf: Fix net.core.bpf_jit_harden race

It is the bpf_jit_harden counterpart to commit 60b58afc96c9 ("bpf: fix
net.core.bpf_jit_enable race"). bpf_jit_harden will be tested twice
for each subprog if there are subprogs in bpf program and constant
blinding may increase the length of program, so when running
"./test_progs -t subprogs" and toggling bpf_jit_harden between 0 and 2,
jit_subprogs may fail because constant blinding increases the length
of subprog instructions during extra passs.

So cache the value of bpf_jit_blinding_enabled() during program
allocation, and use the cached value during constant blinding, subprog
JITing and args tracking of tail call.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220309123321.2400262-4-houtao1@huawei.com
---
 kernel/bpf/core.c     | 3 ++-
 kernel/bpf/verifier.c | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ab630f773ec1..1324f9523e7c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -105,6 +105,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	fp->aux = aux;
 	fp->aux->prog = fp;
 	fp->jit_requested = ebpf_jit_enabled();
+	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
 	mutex_init(&fp->aux->used_maps_mutex);
@@ -1382,7 +1383,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 	struct bpf_insn *insn;
 	int i, rewritten;
 
-	if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+	if (!prog->blinding_requested || prog->blinded)
 		return prog;
 
 	clone = bpf_prog_clone_create(prog, GFP_USER);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0db6cd8dcb35..cf92f9c01556 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13023,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		func[i]->blinding_requested = prog->blinding_requested;
 		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
 		func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
 		func[i]->aux->linfo = prog->aux->linfo;
@@ -13146,6 +13147,7 @@ out_free:
 out_undo_insn:
 	/* cleanup main prog to be interpreted */
 	prog->jit_requested = 0;
+	prog->blinding_requested = 0;
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
 		if (!bpf_pseudo_call(insn))
 			continue;
@@ -13239,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
 	enum bpf_attach_type eatype = prog->expected_attach_type;
-	bool expect_blinding = bpf_jit_blinding_enabled(prog);
 	enum bpf_prog_type prog_type = resolve_prog_type(prog);
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
@@ -13403,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
 			aux = &env->insn_aux_data[i + delta];
-			if (env->bpf_capable && !expect_blinding &&
+			if (env->bpf_capable && !prog->blinding_requested &&
 			    prog->jit_requested &&
 			    !bpf_map_key_poisoned(aux) &&
 			    !bpf_map_ptr_poisoned(aux) &&
-- 
cgit v1.2.3


From 2957308343fa7c621df9f342fab88cb970b8d5f3 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sat, 12 Mar 2022 23:22:20 +0800
Subject: livepatch: Don't block removal of patches that are safe to unload

module_put() is not called for a patch with "forced" flag. It should
block the removal of the livepatch module when the code might still
be in use after forced transition.

klp_force_transition() currently sets "forced" flag for all patches on
the list.

In fact, any patch can be safely unloaded when it passed through
the consistency model in KLP_UNPATCHED transition.

In other words, the "forced" flag must be set only for livepatches
that are being removed. In particular, set the "forced" flag:

  + only for klp_transition_patch when the transition to KLP_UNPATCHED
    state was forced.

  + all replaced patches when the transition to KLP_PATCHED state was
    forced and the patch was replacing the existing patches.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
[mbenes@suse.cz: wording improvements]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220312152220.88127-1-zhouchengming@bytedance.com
---
 kernel/livepatch/transition.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..77ef45a1e0a3 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -641,6 +641,13 @@ void klp_force_transition(void)
 	for_each_possible_cpu(cpu)
 		klp_update_patch_state(idle_task(cpu));
 
-	klp_for_each_patch(patch)
-		patch->forced = true;
+	/* Set forced flag for patches being removed. */
+	if (klp_target_state == KLP_UNPATCHED)
+		klp_transition_patch->forced = true;
+	else if (klp_transition_patch->replace) {
+		klp_for_each_patch(patch) {
+			if (patch != klp_transition_patch)
+				patch->forced = true;
+		}
+	}
 }
-- 
cgit v1.2.3


From 4f554e955614f19425cee86de4669351741a6280 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Tue, 15 Mar 2022 23:00:26 +0900
Subject: ftrace: Add ftrace_set_filter_ips function

Adding ftrace_set_filter_ips function to be able to set filter on
multiple ip addresses at once.

With the kprobe multi attach interface we have cases where we need to
initialize ftrace_ops object with thousands of functions, so having
single function diving into ftrace_hash_move_and_update_ops with
ftrace_lock is faster.

The functions ips are passed as unsigned long array with count.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164735282673.1084943.18310504594134769804.stgit@devnote2
---
 kernel/trace/ftrace.c | 58 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a4b462b6f944..93e992962ada 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4958,7 +4958,7 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
 }
 
 static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
 {
 	struct ftrace_func_entry *entry;
 
@@ -4976,9 +4976,30 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
 	return add_hash_entry(hash, ip);
 }
 
+static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+		  unsigned int cnt, int remove)
+{
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < cnt; i++) {
+		err = __ftrace_match_addr(hash, ips[i], remove);
+		if (err) {
+			/*
+			 * This expects the @hash is a temporary hash and if this
+			 * fails the caller must free the @hash.
+			 */
+			return err;
+		}
+	}
+	return 0;
+}
+
 static int
 ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
-		unsigned long ip, int remove, int reset, int enable)
+		unsigned long *ips, unsigned int cnt,
+		int remove, int reset, int enable)
 {
 	struct ftrace_hash **orig_hash;
 	struct ftrace_hash *hash;
@@ -5008,8 +5029,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
 		ret = -EINVAL;
 		goto out_regex_unlock;
 	}
-	if (ip) {
-		ret = ftrace_match_addr(hash, ip, remove);
+	if (ips) {
+		ret = ftrace_match_addr(hash, ips, cnt, remove);
 		if (ret < 0)
 			goto out_regex_unlock;
 	}
@@ -5026,10 +5047,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
 }
 
 static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
-		int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+		int remove, int reset, int enable)
 {
-	return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+	return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
 }
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5634,10 +5655,29 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
 			 int remove, int reset)
 {
 	ftrace_ops_init(ops);
-	return ftrace_set_addr(ops, ip, remove, reset, 1);
+	return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
 }
 EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
 
+/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ips array or any ip specified within is NULL , it fails to update filter.
+ */
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+			  unsigned int cnt, int remove, int reset)
+{
+	ftrace_ops_init(ops);
+	return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
 /**
  * ftrace_ops_set_global_filter - setup ops to use global filters
  * @ops - the ops which will use the global filters
@@ -5659,7 +5699,7 @@ static int
 ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
 		 int reset, int enable)
 {
-	return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+	return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
 }
 
 /**
-- 
cgit v1.2.3


From cad9931f64dc7f5dbdec12cae9f30063360f9855 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 15 Mar 2022 23:00:38 +0900
Subject: fprobe: Add ftrace based probe APIs

The fprobe is a wrapper API for ftrace function tracer.
Unlike kprobes, this probes only supports the function entry, but this
can probe multiple functions by one fprobe. The usage is similar, user
will set their callback to fprobe::entry_handler and call
register_fprobe*() with probed functions.
There are 3 registration interfaces,

 - register_fprobe() takes filtering patterns of the functin names.
 - register_fprobe_ips() takes an array of ftrace-location addresses.
 - register_fprobe_syms() takes an array of function names.

The registered fprobes can be unregistered with unregister_fprobe().
e.g.

struct fprobe fp = { .entry_handler = user_handler };
const char *targets[] = { "func1", "func2", "func3"};
...

ret = register_fprobe_syms(&fp, targets, ARRAY_SIZE(targets));

...

unregister_fprobe(&fp);

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164735283857.1084943.1154436951479395551.stgit@devnote2
---
 kernel/trace/Kconfig  |  12 +++
 kernel/trace/Makefile |   1 +
 kernel/trace/fprobe.c | 211 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 kernel/trace/fprobe.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5eb5e7fd624..7ce31abc542b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -236,6 +236,18 @@ config DYNAMIC_FTRACE_WITH_ARGS
 	depends on DYNAMIC_FTRACE
 	depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
 
+config FPROBE
+	bool "Kernel Function Probe (fprobe)"
+	depends on FUNCTION_TRACER
+	depends on DYNAMIC_FTRACE_WITH_REGS
+	default n
+	help
+	  This option enables kernel function probe (fprobe) based on ftrace,
+	  which is similar to kprobes, but probes only for kernel function
+	  entries and it can probe multiple functions by one fprobe.
+
+	  If unsure, say N.
+
 config FUNCTION_PROFILER
 	bool "Kernel function profiler"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..79255f9de9a4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -97,6 +97,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
 obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
 obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
 obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
 
 obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
 
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..7e8ceee339a0
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/err.h>
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+			   struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+	struct fprobe *fp;
+	int bit;
+
+	fp = container_of(ops, struct fprobe, ops);
+	if (fprobe_disabled(fp))
+		return;
+
+	bit = ftrace_test_recursion_trylock(ip, parent_ip);
+	if (bit < 0) {
+		fp->nmissed++;
+		return;
+	}
+
+	if (fp->entry_handler)
+		fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+
+	ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+/* Convert ftrace location address from symbols */
+static unsigned long *get_ftrace_locations(const char **syms, int num)
+{
+	unsigned long addr, size;
+	unsigned long *addrs;
+	int i;
+
+	/* Convert symbols to symbol address */
+	addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+	if (!addrs)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < num; i++) {
+		addr = kallsyms_lookup_name(syms[i]);
+		if (!addr)	/* Maybe wrong symbol */
+			goto error;
+
+		/* Convert symbol address to ftrace location. */
+		if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size)
+			goto error;
+
+		addr = ftrace_location_range(addr, addr + size - 1);
+		if (!addr) /* No dynamic ftrace there. */
+			goto error;
+
+		addrs[i] = addr;
+	}
+
+	return addrs;
+
+error:
+	kfree(addrs);
+
+	return ERR_PTR(-ENOENT);
+}
+
+static void fprobe_init(struct fprobe *fp)
+{
+	fp->nmissed = 0;
+	fp->ops.func = fprobe_handler;
+	fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+}
+
+/**
+ * register_fprobe() - Register fprobe to ftrace by pattern.
+ * @fp: A fprobe data structure to be registered.
+ * @filter: A wildcard pattern of probed symbols.
+ * @notfilter: A wildcard pattern of NOT probed symbols.
+ *
+ * Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
+ * If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
+{
+	unsigned char *str;
+	int ret, len;
+
+	if (!fp || !filter)
+		return -EINVAL;
+
+	fprobe_init(fp);
+
+	len = strlen(filter);
+	str = kstrdup(filter, GFP_KERNEL);
+	ret = ftrace_set_filter(&fp->ops, str, len, 0);
+	kfree(str);
+	if (ret)
+		return ret;
+
+	if (notfilter) {
+		len = strlen(notfilter);
+		str = kstrdup(notfilter, GFP_KERNEL);
+		ret = ftrace_set_notrace(&fp->ops, str, len, 0);
+		kfree(str);
+		if (ret)
+			goto out;
+	}
+
+	ret = register_ftrace_function(&fp->ops);
+out:
+	if (ret)
+		ftrace_free_filter(&fp->ops);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * register_fprobe_ips() - Register fprobe to ftrace by address.
+ * @fp: A fprobe data structure to be registered.
+ * @addrs: An array of target ftrace location addresses.
+ * @num: The number of entries of @addrs.
+ *
+ * Register @fp to ftrace for enabling the probe on the address given by @addrs.
+ * The @addrs must be the addresses of ftrace location address, which may be
+ * the symbol address + arch-dependent offset.
+ * If you unsure what this mean, please use other registration functions.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
+{
+	int ret;
+
+	if (!fp || !addrs || num <= 0)
+		return -EINVAL;
+
+	fprobe_init(fp);
+
+	ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+	if (!ret)
+		ret = register_ftrace_function(&fp->ops);
+
+	if (ret)
+		ftrace_free_filter(&fp->ops);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_ips);
+
+/**
+ * register_fprobe_syms() - Register fprobe to ftrace by symbols.
+ * @fp: A fprobe data structure to be registered.
+ * @syms: An array of target symbols.
+ * @num: The number of entries of @syms.
+ *
+ * Register @fp to the symbols given by @syms array. This will be useful if
+ * you are sure the symbols exist in the kernel.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
+{
+	unsigned long *addrs;
+	int ret;
+
+	if (!fp || !syms || num <= 0)
+		return -EINVAL;
+
+	addrs = get_ftrace_locations(syms, num);
+	if (IS_ERR(addrs))
+		return PTR_ERR(addrs);
+
+	ret = register_fprobe_ips(fp, addrs, num);
+
+	kfree(addrs);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_syms);
+
+/**
+ * unregister_fprobe() - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+	int ret;
+
+	if (!fp || fp->ops.func != fprobe_handler)
+		return -EINVAL;
+
+	ret = unregister_ftrace_function(&fp->ops);
+
+	if (!ret)
+		ftrace_free_filter(&fp->ops);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
-- 
cgit v1.2.3


From 54ecbe6f1ed5138c895bdff55608cf502755b20e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 15 Mar 2022 23:00:50 +0900
Subject: rethook: Add a generic return hook

Add a return hook framework which hooks the function return. Most of the
logic came from the kretprobe, but this is independent from kretprobe.

Note that this is expected to be used with other function entry hooking
feature, like ftrace, fprobe, adn kprobes. Eventually this will replace
the kretprobe (e.g. kprobe + rethook = kretprobe), but at this moment,
this is just an additional hook.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164735285066.1084943.9259661137330166643.stgit@devnote2
---
 kernel/exit.c          |   2 +
 kernel/fork.c          |   3 +
 kernel/trace/Kconfig   |  11 ++
 kernel/trace/Makefile  |   1 +
 kernel/trace/rethook.c | 317 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 334 insertions(+)
 create mode 100644 kernel/trace/rethook.c

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..2d1803fa8fe6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,6 +64,7 @@
 #include <linux/compat.h>
 #include <linux/io_uring.h>
 #include <linux/kprobes.h>
+#include <linux/rethook.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -169,6 +170,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
 	kprobe_flush_task(tsk);
+	rethook_flush_task(tsk);
 	perf_event_delayed_put(tsk);
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index a024bf6254df..3db1a4110a25 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2255,6 +2255,9 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef CONFIG_KRETPROBES
 	p->kretprobe_instances.first = NULL;
 #endif
+#ifdef CONFIG_RETHOOK
+	p->rethooks.first = NULL;
+#endif
 
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7ce31abc542b..e75504e42ab8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
 	bool
 
+config HAVE_RETHOOK
+	bool
+
+config RETHOOK
+	bool
+	depends on HAVE_RETHOOK
+	help
+	  Enable generic return hooking feature. This is an internal
+	  API, which will be used by other function-entry hooking
+	  features like fprobe and kprobes.
+
 config HAVE_FUNCTION_TRACER
 	bool
 	help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 79255f9de9a4..c6f11a139eac 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
 obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
 obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
 obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
 
 obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
 
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..ab463a4d2b23
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+/* Return hook list (shadow stack by list) */
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void rethook_flush_task(struct task_struct *tk)
+{
+	struct rethook_node *rhn;
+	struct llist_node *node;
+
+	node = __llist_del_all(&tk->rethooks);
+	while (node) {
+		rhn = container_of(node, struct rethook_node, llist);
+		node = node->next;
+		preempt_disable();
+		rethook_recycle(rhn);
+		preempt_enable();
+	}
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+	struct rethook *rh = container_of(head, struct rethook, rcu);
+	struct rethook_node *rhn;
+	struct freelist_node *node;
+	int count = 1;
+
+	node = rh->pool.head;
+	while (node) {
+		rhn = container_of(node, struct rethook_node, freelist);
+		node = node->next;
+		kfree(rhn);
+		count++;
+	}
+
+	/* The rh->ref is the number of pooled node + 1 */
+	if (refcount_sub_and_test(count, &rh->ref))
+		kfree(rh);
+}
+
+/**
+ * rethook_free() - Free struct rethook.
+ * @rh: the struct rethook to be freed.
+ *
+ * Free the rethook. Before calling this function, user must ensure the
+ * @rh::data is cleaned if needed (or, the handler can access it after
+ * calling this function.) This function will set the @rh to be freed
+ * after all rethook_node are freed (not soon). And the caller must
+ * not touch @rh after calling this.
+ */
+void rethook_free(struct rethook *rh)
+{
+	rcu_assign_pointer(rh->handler, NULL);
+
+	call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+/**
+ * rethook_alloc() - Allocate struct rethook.
+ * @data: a data to pass the @handler when hooking the return.
+ * @handler: the return hook callback function.
+ *
+ * Allocate and initialize a new rethook with @data and @handler.
+ * Return NULL if memory allocation fails or @handler is NULL.
+ * Note that @handler == NULL means this rethook is going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+{
+	struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+
+	if (!rh || !handler)
+		return NULL;
+
+	rh->data = data;
+	rh->handler = handler;
+	rh->pool.head = NULL;
+	refcount_set(&rh->ref, 1);
+
+	return rh;
+}
+
+/**
+ * rethook_add_node() - Add a new node to the rethook.
+ * @rh: the struct rethook.
+ * @node: the struct rethook_node to be added.
+ *
+ * Add @node to @rh. User must allocate @node (as a part of user's
+ * data structure.) The @node fields are initialized in this function.
+ */
+void rethook_add_node(struct rethook *rh, struct rethook_node *node)
+{
+	node->rethook = rh;
+	freelist_add(&node->freelist, &rh->pool);
+	refcount_inc(&rh->ref);
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+	struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+
+	if (refcount_dec_and_test(&node->rethook->ref))
+		kfree(node->rethook);
+	kfree(node);
+}
+
+/**
+ * rethook_recycle() - return the node to rethook.
+ * @node: The struct rethook_node to be returned.
+ *
+ * Return back the @node to @node::rethook. If the @node::rethook is already
+ * marked as freed, this will free the @node.
+ */
+void rethook_recycle(struct rethook_node *node)
+{
+	lockdep_assert_preemption_disabled();
+
+	if (likely(READ_ONCE(node->rethook->handler)))
+		freelist_add(&node->freelist, &node->rethook->pool);
+	else
+		call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+NOKPROBE_SYMBOL(rethook_recycle);
+
+/**
+ * rethook_try_get() - get an unused rethook node.
+ * @rh: The struct rethook which pools the nodes.
+ *
+ * Get an unused rethook node from @rh. If the node pool is empty, this
+ * will return NULL. Caller must disable preemption.
+ */
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+	rethook_handler_t handler = READ_ONCE(rh->handler);
+	struct freelist_node *fn;
+
+	lockdep_assert_preemption_disabled();
+
+	/* Check whether @rh is going to be freed. */
+	if (unlikely(!handler))
+		return NULL;
+
+	fn = freelist_try_get(&rh->pool);
+	if (!fn)
+		return NULL;
+
+	return container_of(fn, struct rethook_node, freelist);
+}
+NOKPROBE_SYMBOL(rethook_try_get);
+
+/**
+ * rethook_hook() - Hook the current function return.
+ * @node: The struct rethook node to hook the function return.
+ * @regs: The struct pt_regs for the function entry.
+ * @mcount: True if this is called from mcount(ftrace) context.
+ *
+ * Hook the current running function return. This must be called when the
+ * function entry (or at least @regs must be the registers of the function
+ * entry.) @mcount is used for identifying the context. If this is called
+ * from ftrace (mcount) callback, @mcount must be set true. If this is called
+ * from the real function entry (e.g. kprobes) @mcount must be set false.
+ * This is because the way to hook the function return depends on the context.
+ */
+void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
+{
+	arch_rethook_prepare(node, regs, mcount);
+	__llist_add(&node->llist, &current->rethooks);
+}
+NOKPROBE_SYMBOL(rethook_hook);
+
+/* This assumes the 'tsk' is the current task or is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+					     struct llist_node **cur)
+{
+	struct rethook_node *rh = NULL;
+	struct llist_node *node = *cur;
+
+	if (!node)
+		node = tsk->rethooks.first;
+	else
+		node = node->next;
+
+	while (node) {
+		rh = container_of(node, struct rethook_node, llist);
+		if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+			*cur = node;
+			return rh->ret_addr;
+		}
+		node = node->next;
+	}
+	return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk in unsigned
+ * long type.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the currect return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ *
+ * Returns found address value or zero if not found.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+				    struct llist_node **cur)
+{
+	struct rethook_node *rhn = NULL;
+	unsigned long ret;
+
+	if (WARN_ON_ONCE(!cur))
+		return 0;
+
+	if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+		return 0;
+
+	do {
+		ret = __rethook_find_ret_addr(tsk, cur);
+		if (!ret)
+			break;
+		rhn = container_of(*cur, struct rethook_node, llist);
+	} while (rhn->frame != frame);
+
+	return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+				      unsigned long correct_ret_addr)
+{
+	/*
+	 * Do nothing by default. If the architecture which uses a
+	 * frame pointer to record real return address on the stack,
+	 * it should fill this function to fixup the return address
+	 * so that stacktrace works from the rethook handler.
+	 */
+}
+
+/* This function will be called from each arch-defined trampoline. */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+					 unsigned long frame)
+{
+	struct llist_node *first, *node = NULL;
+	unsigned long correct_ret_addr;
+	rethook_handler_t handler;
+	struct rethook_node *rhn;
+
+	correct_ret_addr = __rethook_find_ret_addr(current, &node);
+	if (!correct_ret_addr) {
+		pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+		BUG_ON(1);
+	}
+
+	instruction_pointer_set(regs, correct_ret_addr);
+
+	/*
+	 * These loops must be protected from rethook_free_rcu() because those
+	 * are accessing 'rhn->rethook'.
+	 */
+	preempt_disable();
+
+	/*
+	 * Run the handler on the shadow stack. Do not unlink the list here because
+	 * stackdump inside the handlers needs to decode it.
+	 */
+	first = current->rethooks.first;
+	while (first) {
+		rhn = container_of(first, struct rethook_node, llist);
+		if (WARN_ON_ONCE(rhn->frame != frame))
+			break;
+		handler = READ_ONCE(rhn->rethook->handler);
+		if (handler)
+			handler(rhn, rhn->rethook->data, regs);
+
+		if (first == node)
+			break;
+		first = first->next;
+	}
+
+	/* Fixup registers for returning to correct address. */
+	arch_rethook_fixup_return(regs, correct_ret_addr);
+
+	/* Unlink used shadow stack */
+	first = current->rethooks.first;
+	current->rethooks.first = node->next;
+	node->next = NULL;
+
+	while (first) {
+		rhn = container_of(first, struct rethook_node, llist);
+		first = first->next;
+		rethook_recycle(rhn);
+	}
+	preempt_enable();
+
+	return correct_ret_addr;
+}
+NOKPROBE_SYMBOL(rethook_trampoline_handler);
-- 
cgit v1.2.3


From 5b0ab78998e32564a011b14c4c7f9c81e2d42b9d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 15 Mar 2022 23:01:48 +0900
Subject: fprobe: Add exit_handler support

Add exit_handler to fprobe. fprobe + rethook allows us to hook the kernel
function return. The rethook will be enabled only if the
fprobe::exit_handler is set.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164735290790.1084943.10601965782208052202.stgit@devnote2
---
 kernel/trace/Kconfig  |   9 ++--
 kernel/trace/fprobe.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 116 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e75504e42ab8..99dd4ca63d68 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -251,11 +251,14 @@ config FPROBE
 	bool "Kernel Function Probe (fprobe)"
 	depends on FUNCTION_TRACER
 	depends on DYNAMIC_FTRACE_WITH_REGS
+	depends on HAVE_RETHOOK
+	select RETHOOK
 	default n
 	help
-	  This option enables kernel function probe (fprobe) based on ftrace,
-	  which is similar to kprobes, but probes only for kernel function
-	  entries and it can probe multiple functions by one fprobe.
+	  This option enables kernel function probe (fprobe) based on ftrace.
+	  The fprobe is similar to kprobes, but probes only for kernel function
+	  entries and exits. This also can probe multiple functions by one
+	  fprobe.
 
 	  If unsure, say N.
 
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 7e8ceee339a0..38073632bfe4 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -8,12 +8,22 @@
 #include <linux/fprobe.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
+#include <linux/rethook.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
 
+#include "trace.h"
+
+struct fprobe_rethook_node {
+	struct rethook_node node;
+	unsigned long entry_ip;
+};
+
 static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 			   struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
+	struct fprobe_rethook_node *fpr;
+	struct rethook_node *rh;
 	struct fprobe *fp;
 	int bit;
 
@@ -30,10 +40,37 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
 	if (fp->entry_handler)
 		fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
 
+	if (fp->exit_handler) {
+		rh = rethook_try_get(fp->rethook);
+		if (!rh) {
+			fp->nmissed++;
+			goto out;
+		}
+		fpr = container_of(rh, struct fprobe_rethook_node, node);
+		fpr->entry_ip = ip;
+		rethook_hook(rh, ftrace_get_regs(fregs), true);
+	}
+
+out:
 	ftrace_test_recursion_unlock(bit);
 }
 NOKPROBE_SYMBOL(fprobe_handler);
 
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+				struct pt_regs *regs)
+{
+	struct fprobe *fp = (struct fprobe *)data;
+	struct fprobe_rethook_node *fpr;
+
+	if (!fp || fprobe_disabled(fp))
+		return;
+
+	fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+	fp->exit_handler(fp, fpr->entry_ip, regs);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
 /* Convert ftrace location address from symbols */
 static unsigned long *get_ftrace_locations(const char **syms, int num)
 {
@@ -77,6 +114,48 @@ static void fprobe_init(struct fprobe *fp)
 	fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
 }
 
+static int fprobe_init_rethook(struct fprobe *fp, int num)
+{
+	int i, size;
+
+	if (num < 0)
+		return -EINVAL;
+
+	if (!fp->exit_handler) {
+		fp->rethook = NULL;
+		return 0;
+	}
+
+	/* Initialize rethook if needed */
+	size = num * num_possible_cpus() * 2;
+	if (size < 0)
+		return -E2BIG;
+
+	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+	for (i = 0; i < size; i++) {
+		struct rethook_node *node;
+
+		node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL);
+		if (!node) {
+			rethook_free(fp->rethook);
+			fp->rethook = NULL;
+			return -ENOMEM;
+		}
+		rethook_add_node(fp->rethook, node);
+	}
+	return 0;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+	if (fp->rethook) {
+		/* Don't need to cleanup rethook->handler because this is not used. */
+		rethook_free(fp->rethook);
+		fp->rethook = NULL;
+	}
+	ftrace_free_filter(&fp->ops);
+}
+
 /**
  * register_fprobe() - Register fprobe to ftrace by pattern.
  * @fp: A fprobe data structure to be registered.
@@ -90,6 +169,7 @@ static void fprobe_init(struct fprobe *fp)
  */
 int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
 {
+	struct ftrace_hash *hash;
 	unsigned char *str;
 	int ret, len;
 
@@ -114,10 +194,21 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
 			goto out;
 	}
 
-	ret = register_ftrace_function(&fp->ops);
+	/* TODO:
+	 * correctly calculate the total number of filtered symbols
+	 * from both filter and notfilter.
+	 */
+	hash = fp->ops.local_hash.filter_hash;
+	if (WARN_ON_ONCE(!hash))
+		goto out;
+
+	ret = fprobe_init_rethook(fp, (int)hash->count);
+	if (!ret)
+		ret = register_ftrace_function(&fp->ops);
+
 out:
 	if (ret)
-		ftrace_free_filter(&fp->ops);
+		fprobe_fail_cleanup(fp);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_fprobe);
@@ -145,12 +236,15 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	fprobe_init(fp);
 
 	ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+	if (ret)
+		return ret;
+
+	ret = fprobe_init_rethook(fp, num);
 	if (!ret)
 		ret = register_ftrace_function(&fp->ops);
 
 	if (ret)
-		ftrace_free_filter(&fp->ops);
-
+		fprobe_fail_cleanup(fp);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -201,10 +295,20 @@ int unregister_fprobe(struct fprobe *fp)
 	if (!fp || fp->ops.func != fprobe_handler)
 		return -EINVAL;
 
+	/*
+	 * rethook_free() starts disabling the rethook, but the rethook handlers
+	 * may be running on other processors at this point. To make sure that all
+	 * current running handlers are finished, call unregister_ftrace_function()
+	 * after this.
+	 */
+	if (fp->rethook)
+		rethook_free(fp->rethook);
+
 	ret = unregister_ftrace_function(&fp->ops);
+	if (ret < 0)
+		return ret;
 
-	if (!ret)
-		ftrace_free_filter(&fp->ops);
+	ftrace_free_filter(&fp->ops);
 
 	return ret;
 }
-- 
cgit v1.2.3


From ab51e15d535e07be9839e0df056a4ebe9c5bac83 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 15 Mar 2022 23:02:11 +0900
Subject: fprobe: Introduce FPROBE_FL_KPROBE_SHARED flag for fprobe

Introduce FPROBE_FL_KPROBE_SHARED flag for sharing fprobe callback with
kprobes safely from the viewpoint of recursion.

Since the recursion safety of the fprobe (and ftrace) is a bit different
from the kprobes, this may cause an issue if user wants to run the same
code from the fprobe and the kprobes.

The kprobes has per-cpu 'current_kprobe' variable which protects the
kprobe handler from recursion in any case. On the other hand, the fprobe
uses only ftrace_test_recursion_trylock(), which will allow interrupt
context calls another (or same) fprobe during the fprobe user handler is
running.

This is not a matter in cases if the common callback shared among the
kprobes and the fprobe has its own recursion detection, or it can handle
the recursion in the different contexts (normal/interrupt/NMI.)
But if it relies on the 'current_kprobe' recursion lock, it has to check
kprobe_running() and use kprobe_busy_*() APIs.

Fprobe has FPROBE_FL_KPROBE_SHARED flag to do this. If your common callback
code will be shared with kprobes, please set FPROBE_FL_KPROBE_SHARED
*before* registering the fprobe, like;

 fprobe.flags = FPROBE_FL_KPROBE_SHARED;

 register_fprobe(&fprobe, "func*", NULL);

This will protect your common callback from the nested call.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164735293127.1084943.15687374237275817599.stgit@devnote2
---
 kernel/trace/fprobe.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 38073632bfe4..8b2dd5b9dcd1 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -56,6 +56,20 @@ out:
 }
 NOKPROBE_SYMBOL(fprobe_handler);
 
+static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+				  struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+	struct fprobe *fp = container_of(ops, struct fprobe, ops);
+
+	if (unlikely(kprobe_running())) {
+		fp->nmissed++;
+		return;
+	}
+	kprobe_busy_begin();
+	fprobe_handler(ip, parent_ip, ops, fregs);
+	kprobe_busy_end();
+}
+
 static void fprobe_exit_handler(struct rethook_node *rh, void *data,
 				struct pt_regs *regs)
 {
@@ -110,7 +124,10 @@ error:
 static void fprobe_init(struct fprobe *fp)
 {
 	fp->nmissed = 0;
-	fp->ops.func = fprobe_handler;
+	if (fprobe_shared_with_kprobes(fp))
+		fp->ops.func = fprobe_kprobe_handler;
+	else
+		fp->ops.func = fprobe_handler;
 	fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
 }
 
-- 
cgit v1.2.3


From aecf489f2ce51436402818c96639ed6303b540f8 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 16 Mar 2022 13:24:08 +0100
Subject: kallsyms: Skip the name search for empty string

When kallsyms_lookup_name is called with empty string,
it will do futile search for it through all the symbols.

Skipping the search for empty string.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220316122419.933957-3-jolsa@kernel.org
---
 kernel/kallsyms.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 951c93216fc4..79f2eb617a62 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name)
 	unsigned long i;
 	unsigned int off;
 
+	/* Skip the search for empty string. */
+	if (!*name)
+		return 0;
+
 	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
 		off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
 
-- 
cgit v1.2.3


From 0dcac272540613d41c05e89679e4ddb978b612f1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 16 Mar 2022 13:24:09 +0100
Subject: bpf: Add multi kprobe link

Adding new link type BPF_LINK_TYPE_KPROBE_MULTI that attaches kprobe
program through fprobe API.

The fprobe API allows to attach probe on multiple functions at once
very fast, because it works on top of ftrace. On the other hand this
limits the probe point to the function entry or return.

The kprobe program gets the same pt_regs input ctx as when it's attached
through the perf API.

Adding new attach type BPF_TRACE_KPROBE_MULTI that allows attachment
kprobe to multiple function with new link.

User provides array of addresses or symbols with count to attach the
kprobe program to. The new link_create uapi interface looks like:

  struct {
          __u32           flags;
          __u32           cnt;
          __aligned_u64   syms;
          __aligned_u64   addrs;
  } kprobe_multi;

The flags field allows single BPF_TRACE_KPROBE_MULTI bit to create
return multi kprobe.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220316122419.933957-4-jolsa@kernel.org
---
 kernel/bpf/syscall.c     |  26 ++++--
 kernel/trace/bpf_trace.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 232 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9beb585be5a6..b8bb67ee6c57 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -32,6 +32,7 @@
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
+#include <linux/trace_events.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -3022,6 +3023,11 @@ out_put_file:
 	fput(perf_file);
 	return err;
 }
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_PERF_EVENTS */
 
 #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
@@ -4255,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 	return -EINVAL;
 }
 
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.addrs
 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 {
 	enum bpf_prog_type ptype;
@@ -4279,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = tracing_bpf_link_attach(attr, uattr, prog);
 		goto out;
 	case BPF_PROG_TYPE_PERF_EVENT:
-	case BPF_PROG_TYPE_KPROBE:
 	case BPF_PROG_TYPE_TRACEPOINT:
 		if (attr->link_create.attach_type != BPF_PERF_EVENT) {
 			ret = -EINVAL;
@@ -4287,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		}
 		ptype = prog->type;
 		break;
+	case BPF_PROG_TYPE_KPROBE:
+		if (attr->link_create.attach_type != BPF_PERF_EVENT &&
+		    attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
+			ret = -EINVAL;
+			goto out;
+		}
+		ptype = prog->type;
+		break;
 	default:
 		ptype = attach_type_to_prog_type(attr->link_create.attach_type);
 		if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
@@ -4318,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
 #endif
-#ifdef CONFIG_PERF_EVENTS
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
-	case BPF_PROG_TYPE_KPROBE:
 		ret = bpf_perf_link_attach(attr, prog);
 		break;
-#endif
+	case BPF_PROG_TYPE_KPROBE:
+		if (attr->link_create.attach_type == BPF_PERF_EVENT)
+			ret = bpf_perf_link_attach(attr, prog);
+		else
+			ret = bpf_kprobe_multi_link_attach(attr, prog);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a2024ba32a20..fffa2171fae4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,7 @@
 #include <linux/error-injection.h>
 #include <linux/btf_ids.h>
 #include <linux/bpf_lsm.h>
+#include <linux/fprobe.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -2181,3 +2182,213 @@ static int __init bpf_event_init(void)
 
 fs_initcall(bpf_event_init);
 #endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FPROBE
+struct bpf_kprobe_multi_link {
+	struct bpf_link link;
+	struct fprobe fp;
+	unsigned long *addrs;
+};
+
+static void bpf_kprobe_multi_link_release(struct bpf_link *link)
+{
+	struct bpf_kprobe_multi_link *kmulti_link;
+
+	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+	unregister_fprobe(&kmulti_link->fp);
+}
+
+static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_kprobe_multi_link *kmulti_link;
+
+	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+	kvfree(kmulti_link->addrs);
+	kfree(kmulti_link);
+}
+
+static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+	.release = bpf_kprobe_multi_link_release,
+	.dealloc = bpf_kprobe_multi_link_dealloc,
+};
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+			   struct pt_regs *regs)
+{
+	int err;
+
+	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+		err = 0;
+		goto out;
+	}
+
+	migrate_disable();
+	rcu_read_lock();
+	err = bpf_prog_run(link->link.prog, regs);
+	rcu_read_unlock();
+	migrate_enable();
+
+ out:
+	__this_cpu_dec(bpf_prog_active);
+	return err;
+}
+
+static void
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+			  struct pt_regs *regs)
+{
+	unsigned long saved_ip = instruction_pointer(regs);
+	struct bpf_kprobe_multi_link *link;
+
+	/*
+	 * Because fprobe's regs->ip is set to the next instruction of
+	 * dynamic-ftrace instruction, correct entry ip must be set, so
+	 * that the bpf program can access entry address via regs as same
+	 * as kprobes.
+	 *
+	 * Both kprobe and kretprobe see the entry ip of traced function
+	 * as instruction pointer.
+	 */
+	instruction_pointer_set(regs, entry_ip);
+
+	link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+	kprobe_multi_link_prog_run(link, regs);
+
+	instruction_pointer_set(regs, saved_ip);
+}
+
+static int
+kprobe_multi_resolve_syms(const void *usyms, u32 cnt,
+			  unsigned long *addrs)
+{
+	unsigned long addr, size;
+	const char **syms;
+	int err = -ENOMEM;
+	unsigned int i;
+	char *func;
+
+	size = cnt * sizeof(*syms);
+	syms = kvzalloc(size, GFP_KERNEL);
+	if (!syms)
+		return -ENOMEM;
+
+	func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+	if (!func)
+		goto error;
+
+	if (copy_from_user(syms, usyms, size)) {
+		err = -EFAULT;
+		goto error;
+	}
+
+	for (i = 0; i < cnt; i++) {
+		err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
+		if (err == KSYM_NAME_LEN)
+			err = -E2BIG;
+		if (err < 0)
+			goto error;
+		err = -EINVAL;
+		addr = kallsyms_lookup_name(func);
+		if (!addr)
+			goto error;
+		if (!kallsyms_lookup_size_offset(addr, &size, NULL))
+			goto error;
+		addr = ftrace_location_range(addr, addr + size - 1);
+		if (!addr)
+			goto error;
+		addrs[i] = addr;
+	}
+
+	err = 0;
+error:
+	kvfree(syms);
+	kfree(func);
+	return err;
+}
+
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_kprobe_multi_link *link = NULL;
+	struct bpf_link_primer link_primer;
+	unsigned long *addrs;
+	u32 flags, cnt, size;
+	void __user *uaddrs;
+	void __user *usyms;
+	int err;
+
+	/* no support for 32bit archs yet */
+	if (sizeof(u64) != sizeof(void *))
+		return -EOPNOTSUPP;
+
+	if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+		return -EINVAL;
+
+	flags = attr->link_create.kprobe_multi.flags;
+	if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
+		return -EINVAL;
+
+	uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
+	usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+	if (!!uaddrs == !!usyms)
+		return -EINVAL;
+
+	cnt = attr->link_create.kprobe_multi.cnt;
+	if (!cnt)
+		return -EINVAL;
+
+	size = cnt * sizeof(*addrs);
+	addrs = kvmalloc(size, GFP_KERNEL);
+	if (!addrs)
+		return -ENOMEM;
+
+	if (uaddrs) {
+		if (copy_from_user(addrs, uaddrs, size)) {
+			err = -EFAULT;
+			goto error;
+		}
+	} else {
+		err = kprobe_multi_resolve_syms(usyms, cnt, addrs);
+		if (err)
+			goto error;
+	}
+
+	link = kzalloc(sizeof(*link), GFP_KERNEL);
+	if (!link) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
+		      &bpf_kprobe_multi_link_lops, prog);
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto error;
+
+	if (flags & BPF_F_KPROBE_MULTI_RETURN)
+		link->fp.exit_handler = kprobe_multi_link_handler;
+	else
+		link->fp.entry_handler = kprobe_multi_link_handler;
+
+	link->addrs = addrs;
+
+	err = register_fprobe_ips(&link->fp, addrs, cnt);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+
+error:
+	kfree(link);
+	kvfree(addrs);
+	return err;
+}
+#else /* !CONFIG_FPROBE */
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+#endif
-- 
cgit v1.2.3


From 42a5712094e89ef0a125ac0f9d0873f9233368b1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 16 Mar 2022 13:24:10 +0100
Subject: bpf: Add bpf_get_func_ip kprobe helper for multi kprobe link

Adding support to call bpf_get_func_ip helper from kprobe
programs attached by multi kprobe link.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220316122419.933957-5-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fffa2171fae4..250750932228 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1037,6 +1037,18 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
+{
+	return instruction_pointer(regs);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
+	.func		= bpf_get_func_ip_kprobe_multi,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
 {
 	struct bpf_trace_run_ctx *run_ctx;
@@ -1280,7 +1292,9 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_override_return_proto;
 #endif
 	case BPF_FUNC_get_func_ip:
-		return &bpf_get_func_ip_proto_kprobe;
+		return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+			&bpf_get_func_ip_proto_kprobe_multi :
+			&bpf_get_func_ip_proto_kprobe;
 	case BPF_FUNC_get_attach_cookie:
 		return &bpf_get_attach_cookie_proto_trace;
 	default:
-- 
cgit v1.2.3


From 97ee4d20ee67eb462581a7af01442de6586e390b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 16 Mar 2022 13:24:11 +0100
Subject: bpf: Add support to inline bpf_get_func_ip helper on x86

Adding support to inline it on x86, because it's single
load instruction.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220316122419.933957-6-jolsa@kernel.org
---
 kernel/bpf/verifier.c    | 21 ++++++++++++++++++++-
 kernel/trace/bpf_trace.c |  1 +
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cf92f9c01556..0287176bfe9a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13678,7 +13678,7 @@ patch_map_ops_generic:
 			continue;
 		}
 
-		/* Implement bpf_get_func_ip inline. */
+		/* Implement tracing bpf_get_func_ip inline. */
 		if (prog_type == BPF_PROG_TYPE_TRACING &&
 		    insn->imm == BPF_FUNC_get_func_ip) {
 			/* Load IP address from ctx - 16 */
@@ -13693,6 +13693,25 @@ patch_map_ops_generic:
 			continue;
 		}
 
+#ifdef CONFIG_X86
+		/* Implement kprobe_multi bpf_get_func_ip inline. */
+		if (prog_type == BPF_PROG_TYPE_KPROBE &&
+		    eatype == BPF_TRACE_KPROBE_MULTI &&
+		    insn->imm == BPF_FUNC_get_func_ip) {
+			/* Load IP address from ctx (struct pt_regs) ip */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+						  offsetof(struct pt_regs, ip));
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+			if (!new_prog)
+				return -ENOMEM;
+
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+#endif
+
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
 		/* all functions that have prototype and verifier allowed
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 250750932228..0e7f8c9bc756 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1039,6 +1039,7 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
 
 BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
 {
+	/* This helper call is inlined by verifier on x86. */
 	return instruction_pointer(regs);
 }
 
-- 
cgit v1.2.3


From ca74823c6e16dd42b7cf60d9fdde80e2a81a67bb Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 16 Mar 2022 13:24:12 +0100
Subject: bpf: Add cookie support to programs attached with kprobe multi link

Adding support to call bpf_get_attach_cookie helper from
kprobe programs attached with kprobe multi link.

The cookie is provided by array of u64 values, where each
value is paired with provided function address or symbol
with the same array index.

When cookie array is provided it's sorted together with
addresses (check bpf_kprobe_multi_cookie_swap). This way
we can find cookie based on the address in
bpf_get_attach_cookie helper.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220316122419.933957-7-jolsa@kernel.org
---
 kernel/bpf/syscall.c     |   2 +-
 kernel/trace/bpf_trace.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b8bb67ee6c57..cdaa1152436a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4261,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 	return -EINVAL;
 }
 
-#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.addrs
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 {
 	enum bpf_prog_type ptype;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0e7f8c9bc756..9a7b6be655e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -18,6 +18,8 @@
 #include <linux/btf_ids.h>
 #include <linux/bpf_lsm.h>
 #include <linux/fprobe.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -78,6 +80,7 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
 				  u64 flags, const struct btf **btf,
 				  s32 *btf_id);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -1050,6 +1053,18 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
+{
+	return bpf_kprobe_multi_cookie(current->bpf_ctx, instruction_pointer(regs));
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
+	.func		= bpf_get_attach_cookie_kprobe_multi,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
 {
 	struct bpf_trace_run_ctx *run_ctx;
@@ -1297,7 +1312,9 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 			&bpf_get_func_ip_proto_kprobe_multi :
 			&bpf_get_func_ip_proto_kprobe;
 	case BPF_FUNC_get_attach_cookie:
-		return &bpf_get_attach_cookie_proto_trace;
+		return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+			&bpf_get_attach_cookie_proto_kmulti :
+			&bpf_get_attach_cookie_proto_trace;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
@@ -2203,6 +2220,13 @@ struct bpf_kprobe_multi_link {
 	struct bpf_link link;
 	struct fprobe fp;
 	unsigned long *addrs;
+	/*
+	 * The run_ctx here is used to get struct bpf_kprobe_multi_link in
+	 * get_attach_cookie helper, so it can't be used to store data.
+	 */
+	struct bpf_run_ctx run_ctx;
+	u64 *cookies;
+	u32 cnt;
 };
 
 static void bpf_kprobe_multi_link_release(struct bpf_link *link)
@@ -2219,6 +2243,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
 
 	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
 	kvfree(kmulti_link->addrs);
+	kvfree(kmulti_link->cookies);
 	kfree(kmulti_link);
 }
 
@@ -2227,10 +2252,60 @@ static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
 	.dealloc = bpf_kprobe_multi_link_dealloc,
 };
 
+static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
+{
+	const struct bpf_kprobe_multi_link *link = priv;
+	unsigned long *addr_a = a, *addr_b = b;
+	u64 *cookie_a, *cookie_b;
+	unsigned long tmp1;
+	u64 tmp2;
+
+	cookie_a = link->cookies + (addr_a - link->addrs);
+	cookie_b = link->cookies + (addr_b - link->addrs);
+
+	/* swap addr_a/addr_b and cookie_a/cookie_b values */
+	tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1;
+	tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2;
+}
+
+static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+{
+	const unsigned long *addr_a = a, *addr_b = b;
+
+	if (*addr_a == *addr_b)
+		return 0;
+	return *addr_a < *addr_b ? -1 : 1;
+}
+
+static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
+{
+	return __bpf_kprobe_multi_cookie_cmp(a, b);
+}
+
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip)
+{
+	struct bpf_kprobe_multi_link *link;
+	unsigned long *addr;
+	u64 *cookie;
+
+	if (WARN_ON_ONCE(!ctx))
+		return 0;
+	link = container_of(ctx, struct bpf_kprobe_multi_link, run_ctx);
+	if (!link->cookies)
+		return 0;
+	addr = bsearch(&ip, link->addrs, link->cnt, sizeof(ip),
+		       __bpf_kprobe_multi_cookie_cmp);
+	if (!addr)
+		return 0;
+	cookie = link->cookies + (addr - link->addrs);
+	return *cookie;
+}
+
 static int
 kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 			   struct pt_regs *regs)
 {
+	struct bpf_run_ctx *old_run_ctx;
 	int err;
 
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
@@ -2240,7 +2315,9 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 
 	migrate_disable();
 	rcu_read_lock();
+	old_run_ctx = bpf_set_run_ctx(&link->run_ctx);
 	err = bpf_prog_run(link->link.prog, regs);
+	bpf_reset_run_ctx(old_run_ctx);
 	rcu_read_unlock();
 	migrate_enable();
 
@@ -2326,9 +2403,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 {
 	struct bpf_kprobe_multi_link *link = NULL;
 	struct bpf_link_primer link_primer;
+	void __user *ucookies;
 	unsigned long *addrs;
 	u32 flags, cnt, size;
 	void __user *uaddrs;
+	u64 *cookies = NULL;
 	void __user *usyms;
 	int err;
 
@@ -2368,6 +2447,19 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 			goto error;
 	}
 
+	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+	if (ucookies) {
+		cookies = kvmalloc(size, GFP_KERNEL);
+		if (!cookies) {
+			err = -ENOMEM;
+			goto error;
+		}
+		if (copy_from_user(cookies, ucookies, size)) {
+			err = -EFAULT;
+			goto error;
+		}
+	}
+
 	link = kzalloc(sizeof(*link), GFP_KERNEL);
 	if (!link) {
 		err = -ENOMEM;
@@ -2387,6 +2479,21 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 		link->fp.entry_handler = kprobe_multi_link_handler;
 
 	link->addrs = addrs;
+	link->cookies = cookies;
+	link->cnt = cnt;
+
+	if (cookies) {
+		/*
+		 * Sorting addresses will trigger sorting cookies as well
+		 * (check bpf_kprobe_multi_cookie_swap). This way we can
+		 * find cookie based on the address in bpf_get_attach_cookie
+		 * helper.
+		 */
+		sort_r(addrs, cnt, sizeof(*addrs),
+		       bpf_kprobe_multi_cookie_cmp,
+		       bpf_kprobe_multi_cookie_swap,
+		       link);
+	}
 
 	err = register_fprobe_ips(&link->fp, addrs, cnt);
 	if (err) {
@@ -2399,6 +2506,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 error:
 	kfree(link);
 	kvfree(addrs);
+	kvfree(cookies);
 	return err;
 }
 #else /* !CONFIG_FPROBE */
@@ -2406,4 +2514,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 {
 	return -EOPNOTSUPP;
 }
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip)
+{
+	return 0;
+}
 #endif
-- 
cgit v1.2.3


From 336d4b814bf078fa698488632c19beca47308896 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:15:32 -0600
Subject: ptrace: Move setting/clearing ptrace_message into ptrace_stop

Today ptrace_message is easy to overlook as it not a core part of
ptrace_stop.  It has been overlooked so much that there are places
that set ptrace_message and don't clear it, and places that never set
it.  So if you get an unlucky sequence of events the ptracer may be
able to read a ptrace_message that does not apply to the current
ptrace stop.

Move setting of ptrace_message into ptrace_stop so that it always gets
set before the stop, and always gets cleared after the stop.  This
prevents non-sense from being reported to userspace and makes
ptrace_message more visible in the ptrace helper functions so that
kernel developers can see it.

Link: https://lkml.kernel.org/r/87bky67qfv.fsf_-_@email.froward.int.ebiederm.org
Acked-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index c2dee5420567..a49ac7149256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2191,7 +2191,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
  * If we actually decide not to stop at all because the tracer
  * is gone, we keep current->exit_code unless clear_code.
  */
-static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
+static void ptrace_stop(int exit_code, int why, int clear_code,
+			unsigned long message, kernel_siginfo_t *info)
 	__releases(&current->sighand->siglock)
 	__acquires(&current->sighand->siglock)
 {
@@ -2237,6 +2238,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 	 */
 	smp_wmb();
 
+	current->ptrace_message = message;
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
 
@@ -2315,6 +2317,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 	 */
 	spin_lock_irq(&current->sighand->siglock);
 	current->last_siginfo = NULL;
+	current->ptrace_message = 0;
 
 	/* LISTENING can be set only during STOP traps, clear it */
 	current->jobctl &= ~JOBCTL_LISTENING;
@@ -2327,7 +2330,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 	recalc_sigpending_tsk(current);
 }
 
-static void ptrace_do_notify(int signr, int exit_code, int why)
+static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
 {
 	kernel_siginfo_t info;
 
@@ -2338,17 +2341,17 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
-	ptrace_stop(exit_code, why, 1, &info);
+	ptrace_stop(exit_code, why, 1, message, &info);
 }
 
-void ptrace_notify(int exit_code)
+void ptrace_notify(int exit_code, unsigned long message)
 {
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
+	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
@@ -2504,10 +2507,10 @@ static void do_jobctl_trap(void)
 			signr = SIGTRAP;
 		WARN_ON_ONCE(!signr);
 		ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
-				 CLD_STOPPED);
+				 CLD_STOPPED, 0);
 	} else {
 		WARN_ON_ONCE(!signr);
-		ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+		ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
 		current->exit_code = 0;
 	}
 }
@@ -2561,7 +2564,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
 	 * comment in dequeue_signal().
 	 */
 	current->jobctl |= JOBCTL_STOP_DEQUEUED;
-	ptrace_stop(signr, CLD_TRAPPED, 0, info);
+	ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
 
 	/* We're back.  Did the debugger cancel the sig?  */
 	signr = current->exit_code;
@@ -2891,7 +2894,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
 	if (current->sas_ss_flags & SS_AUTODISARM)
 		sas_ss_reset(current);
 	if (stepping)
-		ptrace_notify(SIGTRAP);
+		ptrace_notify(SIGTRAP, 0);
 }
 
 void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
-- 
cgit v1.2.3


From 6487d1dab837214ec2fd3f0ddd5f787e63be7c20 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:19:13 -0600
Subject: ptrace: Return the signal to continue with from ptrace_stop

The signal a task should continue with after a ptrace stop is
inconsistently read, cleared, and sent.  Solve this by reading and
clearing the signal to be sent in ptrace_stop.

In an ideal world everything except ptrace_signal would share a common
implementation of continuing with the signal, so ptracers could count
on the signal they ask to continue with actually being delivered.  For
now retain bug compatibility and just return with the signal number
the ptracer requested the code continue with.

Link: https://lkml.kernel.org/r/875yoe7qdp.fsf_-_@email.froward.int.ebiederm.org
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index a49ac7149256..f1c4ab85833c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2188,15 +2188,17 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
  * That makes it a way to test a stopped process for
  * being ptrace-stopped vs being job-control-stopped.
  *
- * If we actually decide not to stop at all because the tracer
- * is gone, we keep current->exit_code unless clear_code.
+ * Returns the signal the ptracer requested the code resume
+ * with.  If the code did not stop because the tracer is gone,
+ * the stop signal remains unchanged unless clear_code.
  */
-static void ptrace_stop(int exit_code, int why, int clear_code,
+static int ptrace_stop(int exit_code, int why, int clear_code,
 			unsigned long message, kernel_siginfo_t *info)
 	__releases(&current->sighand->siglock)
 	__acquires(&current->sighand->siglock)
 {
 	bool gstop_done = false;
+	bool read_code = true;
 
 	if (arch_ptrace_stop_needed()) {
 		/*
@@ -2305,8 +2307,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code,
 
 		/* tasklist protects us from ptrace_freeze_traced() */
 		__set_current_state(TASK_RUNNING);
+		read_code = false;
 		if (clear_code)
-			current->exit_code = 0;
+			exit_code = 0;
 		read_unlock(&tasklist_lock);
 	}
 
@@ -2316,8 +2319,11 @@ static void ptrace_stop(int exit_code, int why, int clear_code,
 	 * any signal-sending on another CPU that wants to examine it.
 	 */
 	spin_lock_irq(&current->sighand->siglock);
+	if (read_code)
+		exit_code = current->exit_code;
 	current->last_siginfo = NULL;
 	current->ptrace_message = 0;
+	current->exit_code = 0;
 
 	/* LISTENING can be set only during STOP traps, clear it */
 	current->jobctl &= ~JOBCTL_LISTENING;
@@ -2328,9 +2334,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code,
 	 * This sets TIF_SIGPENDING, but never clears it.
 	 */
 	recalc_sigpending_tsk(current);
+	return exit_code;
 }
 
-static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
+static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
 {
 	kernel_siginfo_t info;
 
@@ -2341,18 +2348,21 @@ static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long me
 	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
-	ptrace_stop(exit_code, why, 1, message, &info);
+	return ptrace_stop(exit_code, why, 1, message, &info);
 }
 
-void ptrace_notify(int exit_code, unsigned long message)
+int ptrace_notify(int exit_code, unsigned long message)
 {
+	int signr;
+
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
+	signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
 	spin_unlock_irq(&current->sighand->siglock);
+	return signr;
 }
 
 /**
@@ -2511,7 +2521,6 @@ static void do_jobctl_trap(void)
 	} else {
 		WARN_ON_ONCE(!signr);
 		ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
-		current->exit_code = 0;
 	}
 }
 
@@ -2564,15 +2573,12 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
 	 * comment in dequeue_signal().
 	 */
 	current->jobctl |= JOBCTL_STOP_DEQUEUED;
-	ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
+	signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
 
 	/* We're back.  Did the debugger cancel the sig?  */
-	signr = current->exit_code;
 	if (signr == 0)
 		return signr;
 
-	current->exit_code = 0;
-
 	/*
 	 * Update the siginfo structure if the signal has
 	 * changed.  If the debugger wanted something
-- 
cgit v1.2.3


From edc3ec09ab706c45e955f7a52f0904b4ed649ca9 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 17 Mar 2022 17:29:43 +0530
Subject: bpf: Factor out fd returning from bpf_btf_find_by_name_kind

In next few patches, we need a helper that searches all kernel BTFs
(vmlinux and module BTFs), and finds the type denoted by 'name' and
'kind'. Turns out bpf_btf_find_by_name_kind already does the same thing,
but it instead returns a BTF ID and optionally fd (if module BTF). This
is used for relocating ksyms in BPF loader code (bpftool gen skel -L).

We extract the core code out into a new helper bpf_find_btf_id, which
returns the BTF ID in the return value, and BTF pointer in an out
parameter. The reference for the returned BTF pointer is always raised,
hence user must either transfer it (e.g. to a fd), or release it after
use.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220317115957.3193097-2-memxor@gmail.com
---
 kernel/bpf/btf.c | 90 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 8b34563a832e..17b9adcd88d3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -525,6 +525,48 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
 	return -ENOENT;
 }
 
+static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+	struct btf *btf;
+	s32 ret;
+	int id;
+
+	btf = bpf_get_btf_vmlinux();
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	ret = btf_find_by_name_kind(btf, name, kind);
+	/* ret is never zero, since btf_find_by_name_kind returns
+	 * positive btf_id or negative error.
+	 */
+	if (ret > 0) {
+		btf_get(btf);
+		*btf_p = btf;
+		return ret;
+	}
+
+	/* If name is not found in vmlinux's BTF then search in module's BTFs */
+	spin_lock_bh(&btf_idr_lock);
+	idr_for_each_entry(&btf_idr, btf, id) {
+		if (!btf_is_module(btf))
+			continue;
+		/* linear search could be slow hence unlock/lock
+		 * the IDR to avoiding holding it for too long
+		 */
+		btf_get(btf);
+		spin_unlock_bh(&btf_idr_lock);
+		ret = btf_find_by_name_kind(btf, name, kind);
+		if (ret > 0) {
+			*btf_p = btf;
+			return ret;
+		}
+		spin_lock_bh(&btf_idr_lock);
+		btf_put(btf);
+	}
+	spin_unlock_bh(&btf_idr_lock);
+	return ret;
+}
+
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
 					       u32 id, u32 *res_id)
 {
@@ -6562,7 +6604,8 @@ static struct btf *btf_get_module_btf(const struct module *module)
 
 BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
 {
-	struct btf *btf;
+	struct btf *btf = NULL;
+	int btf_obj_fd = 0;
 	long ret;
 
 	if (flags)
@@ -6571,44 +6614,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int
 	if (name_sz <= 1 || name[name_sz - 1])
 		return -EINVAL;
 
-	btf = bpf_get_btf_vmlinux();
-	if (IS_ERR(btf))
-		return PTR_ERR(btf);
-
-	ret = btf_find_by_name_kind(btf, name, kind);
-	/* ret is never zero, since btf_find_by_name_kind returns
-	 * positive btf_id or negative error.
-	 */
-	if (ret < 0) {
-		struct btf *mod_btf;
-		int id;
-
-		/* If name is not found in vmlinux's BTF then search in module's BTFs */
-		spin_lock_bh(&btf_idr_lock);
-		idr_for_each_entry(&btf_idr, mod_btf, id) {
-			if (!btf_is_module(mod_btf))
-				continue;
-			/* linear search could be slow hence unlock/lock
-			 * the IDR to avoiding holding it for too long
-			 */
-			btf_get(mod_btf);
-			spin_unlock_bh(&btf_idr_lock);
-			ret = btf_find_by_name_kind(mod_btf, name, kind);
-			if (ret > 0) {
-				int btf_obj_fd;
-
-				btf_obj_fd = __btf_new_fd(mod_btf);
-				if (btf_obj_fd < 0) {
-					btf_put(mod_btf);
-					return btf_obj_fd;
-				}
-				return ret | (((u64)btf_obj_fd) << 32);
-			}
-			spin_lock_bh(&btf_idr_lock);
-			btf_put(mod_btf);
+	ret = bpf_find_btf_id(name, kind, &btf);
+	if (ret > 0 && btf_is_module(btf)) {
+		btf_obj_fd = __btf_new_fd(btf);
+		if (btf_obj_fd < 0) {
+			btf_put(btf);
+			return btf_obj_fd;
 		}
-		spin_unlock_bh(&btf_idr_lock);
+		return ret | (((u64)btf_obj_fd) << 32);
 	}
+	if (ret > 0)
+		btf_put(btf);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9492450fd28736262dea9143ebb3afc2c131ace1 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 17 Mar 2022 17:29:51 +0530
Subject: bpf: Always raise reference in btf_get_module_btf

Align it with helpers like bpf_find_btf_id, so all functions returning
BTF in out parameter follow the same rule of raising reference
consistently, regardless of module or vmlinux BTF.

Adjust existing callers to handle the change accordinly.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220317115957.3193097-10-memxor@gmail.com
---
 kernel/bpf/btf.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 17b9adcd88d3..6d9e711cb5d4 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6572,20 +6572,23 @@ struct module *btf_try_get_module(const struct btf *btf)
 	return res;
 }
 
-/* Returns struct btf corresponding to the struct module
- *
- * This function can return NULL or ERR_PTR. Note that caller must
- * release reference for struct btf iff btf_is_module is true.
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
  */
 static struct btf *btf_get_module_btf(const struct module *module)
 {
-	struct btf *btf = NULL;
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	struct btf_module *btf_mod, *tmp;
 #endif
+	struct btf *btf = NULL;
+
+	if (!module) {
+		btf = bpf_get_btf_vmlinux();
+		if (!IS_ERR(btf))
+			btf_get(btf);
+		return btf;
+	}
 
-	if (!module)
-		return bpf_get_btf_vmlinux();
 #ifdef CONFIG_DEBUG_INFO_BTF_MODULES
 	mutex_lock(&btf_module_mutex);
 	list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6823,9 +6826,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 
 	hook = bpf_prog_type_to_kfunc_hook(prog_type);
 	ret = btf_populate_kfunc_set(btf, hook, kset);
-	/* reference is only taken for module BTF */
-	if (btf_is_module(btf))
-		btf_put(btf);
+	btf_put(btf);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
-- 
cgit v1.2.3


From 795301d3c28996219d555023ac6863401b6076bc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 18 Mar 2022 15:34:32 -0400
Subject: tracing: Have type enum modifications copy the strings

When an enum is used in the visible parts of a trace event that is
exported to user space, the user space applications like perf and
trace-cmd do not have a way to know what the value of the enum is. To
solve this, at boot up (or module load) the printk formats are modified to
replace the enum with their numeric value in the string output.

Array fields of the event are defined by [<nr-elements>] in the type
portion of the format file so that the user space parsers can correctly
parse the array into the appropriate size chunks. But in some trace
events, an enum is used in defining the size of the array, which once
again breaks the parsing of user space tooling.

This was solved the same way as the print formats were, but it modified
the type strings of the trace event. This caused crashes in some
architectures because, as supposed to the print string, is a const string
value. This was not detected on x86, as it appears that const strings are
still writable (at least in boot up), but other architectures this is not
the case, and writing to a const string will cause a kernel fault.

To fix this, use kstrdup() to copy the type before modifying it. If the
trace event is for the core kernel there's no need to free it because the
string will be in use for the life of the machine being on line. For
modules, create a link list to store all the strings being allocated for
modules and when the module is removed, free them.

Link: https://lore.kernel.org/all/yt9dr1706b4i.fsf@linux.ibm.com/
Link: https://lkml.kernel.org/r/20220318153432.3984b871@gandalf.local.home

Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Reported-by: Sven Schnelle <svens@linux.ibm.com>
Fixes: b3bc8547d3be ("tracing: Have TRACE_DEFINE_ENUM affect trace event types as well")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ae9a3b8481f5..0d91152172c9 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
 static LIST_HEAD(ftrace_common_fields);
 static bool eventdir_initialized;
 
+static LIST_HEAD(module_strings);
+
+struct module_string {
+	struct list_head	next;
+	struct module		*module;
+	char			*str;
+};
+
 #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
 
 static struct kmem_cache *field_cachep;
@@ -2633,14 +2641,40 @@ static void update_event_printk(struct trace_event_call *call,
 	}
 }
 
+static void add_str_to_module(struct module *module, char *str)
+{
+	struct module_string *modstr;
+
+	modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+	/*
+	 * If we failed to allocate memory here, then we'll just
+	 * let the str memory leak when the module is removed.
+	 * If this fails to allocate, there's worse problems than
+	 * a leaked string on module removal.
+	 */
+	if (WARN_ON_ONCE(!modstr))
+		return;
+
+	modstr->module = module;
+	modstr->str = str;
+
+	list_add(&modstr->next, &module_strings);
+}
+
 static void update_event_fields(struct trace_event_call *call,
 				struct trace_eval_map *map)
 {
 	struct ftrace_event_field *field;
 	struct list_head *head;
 	char *ptr;
+	char *str;
 	int len = strlen(map->eval_string);
 
+	/* Dynamic events should never have field maps */
+	if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+		return;
+
 	head = trace_get_fields(call);
 	list_for_each_entry(field, head, link) {
 		ptr = strchr(field->type, '[');
@@ -2654,9 +2688,26 @@ static void update_event_fields(struct trace_event_call *call,
 		if (strncmp(map->eval_string, ptr, len) != 0)
 			continue;
 
+		str = kstrdup(field->type, GFP_KERNEL);
+		if (WARN_ON_ONCE(!str))
+			return;
+		ptr = str + (ptr - field->type);
 		ptr = eval_replace(ptr, map, len);
 		/* enum/sizeof string smaller than value */
-		WARN_ON_ONCE(!ptr);
+		if (WARN_ON_ONCE(!ptr)) {
+			kfree(str);
+			continue;
+		}
+
+		/*
+		 * If the event is part of a module, then we need to free the string
+		 * when the module is removed. Otherwise, it will stay allocated
+		 * until a reboot.
+		 */
+		if (call->module)
+			add_str_to_module(call->module, str);
+
+		field->type = str;
 	}
 }
 
@@ -2883,6 +2934,7 @@ static void trace_module_add_events(struct module *mod)
 static void trace_module_remove_events(struct module *mod)
 {
 	struct trace_event_call *call, *p;
+	struct module_string *modstr, *m;
 
 	down_write(&trace_event_sem);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2891,6 +2943,14 @@ static void trace_module_remove_events(struct module *mod)
 		if (call->module == mod)
 			__trace_remove_event_call(call);
 	}
+	/* Check for any strings allocade for this module */
+	list_for_each_entry_safe(modstr, m, &module_strings, next) {
+		if (modstr->module != mod)
+			continue;
+		list_del(&modstr->next);
+		kfree(modstr->str);
+		kfree(modstr);
+	}
 	up_write(&trace_event_sem);
 
 	/*
-- 
cgit v1.2.3


From b00fa38a9c1cba044a32a601b49a55a18ed719d1 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 17 Mar 2022 21:55:52 -0700
Subject: bpf: Enable non-atomic allocations in local storage

Currently, local storage memory can only be allocated atomically
(GFP_ATOMIC). This restriction is too strict for sleepable bpf
programs.

In this patch, the verifier detects whether the program is sleepable,
and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a
5th argument to bpf_task/sk/inode_storage_get. This flag will propagate
down to the local storage functions that allocate memory.

Please note that bpf_task/sk/inode_storage_update_elem functions are
invoked by userspace applications through syscalls. Preemption is
disabled before bpf_task/sk/inode_storage_update_elem is called, which
means they will always have to allocate memory atomically.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220318045553.3091807-2-joannekoong@fb.com
---
 kernel/bpf/bpf_inode_storage.c |  9 ++++---
 kernel/bpf/bpf_local_storage.c | 58 +++++++++++++++++++++++++++---------------
 kernel/bpf/bpf_task_storage.c  | 10 +++++---
 kernel/bpf/verifier.c          | 20 +++++++++++++++
 4 files changed, 68 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e29d9e3d853e..96be8d518885 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
 
 	sdata = bpf_local_storage_update(f->f_inode,
 					 (struct bpf_local_storage_map *)map,
-					 value, map_flags);
+					 value, map_flags, GFP_ATOMIC);
 	fput(f);
 	return PTR_ERR_OR_ZERO(sdata);
 }
@@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 	return err;
 }
 
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
-	   void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+	   void *, value, u64, flags, gfp_t, gfp_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
 		sdata = bpf_local_storage_update(
 			inode, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, gfp_flags);
 		return IS_ERR(sdata) ? (unsigned long)NULL :
 					     (unsigned long)sdata->data;
 	}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 092a1ac772d7..01aa2b51ec4d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
 
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
-		void *value, bool charge_mem)
+		void *value, bool charge_mem, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_elem *selem;
 
@@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 		return NULL;
 
 	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				GFP_ATOMIC | __GFP_NOWARN);
+				gfp_flags | __GFP_NOWARN);
 	if (selem) {
 		if (value)
 			memcpy(SDATA(selem)->data, value, smap->map.value_size);
@@ -282,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
 
 int bpf_local_storage_alloc(void *owner,
 			    struct bpf_local_storage_map *smap,
-			    struct bpf_local_storage_elem *first_selem)
+			    struct bpf_local_storage_elem *first_selem,
+			    gfp_t gfp_flags)
 {
 	struct bpf_local_storage *prev_storage, *storage;
 	struct bpf_local_storage **owner_storage_ptr;
@@ -293,7 +294,7 @@ int bpf_local_storage_alloc(void *owner,
 		return err;
 
 	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  GFP_ATOMIC | __GFP_NOWARN);
+				  gfp_flags | __GFP_NOWARN);
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -350,10 +351,10 @@ uncharge:
  */
 struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
-			 void *value, u64 map_flags)
+			 void *value, u64 map_flags, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
-	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_elem *selem = NULL;
 	struct bpf_local_storage *local_storage;
 	unsigned long flags;
 	int err;
@@ -365,6 +366,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		     !map_value_has_spin_lock(&smap->map)))
 		return ERR_PTR(-EINVAL);
 
+	if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
+		return ERR_PTR(-EINVAL);
+
 	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
 					      bpf_rcu_lock_held());
 	if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -373,11 +377,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		if (err)
 			return ERR_PTR(err);
 
-		selem = bpf_selem_alloc(smap, owner, value, true);
+		selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
 		if (!selem)
 			return ERR_PTR(-ENOMEM);
 
-		err = bpf_local_storage_alloc(owner, smap, selem);
+		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
 		if (err) {
 			kfree(selem);
 			mem_uncharge(smap, owner, smap->elem_size);
@@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
+	if (gfp_flags == GFP_KERNEL) {
+		selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+		if (!selem)
+			return ERR_PTR(-ENOMEM);
+	}
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
 	/* Recheck local_storage->list under local_storage->lock */
@@ -429,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		goto unlock;
 	}
 
-	/* local_storage->lock is held.  Hence, we are sure
-	 * we can unlink and uncharge the old_sdata successfully
-	 * later.  Hence, instead of charging the new selem now
-	 * and then uncharge the old selem later (which may cause
-	 * a potential but unnecessary charge failure),  avoid taking
-	 * a charge at all here (the "!old_sdata" check) and the
-	 * old_sdata will not be uncharged later during
-	 * bpf_selem_unlink_storage_nolock().
-	 */
-	selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
-	if (!selem) {
-		err = -ENOMEM;
-		goto unlock_err;
+	if (gfp_flags != GFP_KERNEL) {
+		/* local_storage->lock is held.  Hence, we are sure
+		 * we can unlink and uncharge the old_sdata successfully
+		 * later.  Hence, instead of charging the new selem now
+		 * and then uncharge the old selem later (which may cause
+		 * a potential but unnecessary charge failure),  avoid taking
+		 * a charge at all here (the "!old_sdata" check) and the
+		 * old_sdata will not be uncharged later during
+		 * bpf_selem_unlink_storage_nolock().
+		 */
+		selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
+		if (!selem) {
+			err = -ENOMEM;
+			goto unlock_err;
+		}
 	}
 
 	/* First, link the new selem to the map */
@@ -463,6 +475,10 @@ unlock:
 
 unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	if (selem) {
+		mem_uncharge(smap, owner, smap->elem_size);
+		kfree(selem);
+	}
 	return ERR_PTR(err);
 }
 
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 5da7bed0f5f6..6638a0ecc3d2 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 
 	bpf_task_storage_lock();
 	sdata = bpf_local_storage_update(
-		task, (struct bpf_local_storage_map *)map, value, map_flags);
+		task, (struct bpf_local_storage_map *)map, value, map_flags,
+		GFP_ATOMIC);
 	bpf_task_storage_unlock();
 
 	err = PTR_ERR_OR_ZERO(sdata);
@@ -226,8 +227,9 @@ out:
 	return err;
 }
 
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
-	   task, void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+	   task, void *, value, u64, flags, gfp_t, gfp_flags)
 {
 	struct bpf_local_storage_data *sdata;
 
@@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
 		sdata = bpf_local_storage_update(
 			task, (struct bpf_local_storage_map *)map, value,
-			BPF_NOEXIST);
+			BPF_NOEXIST, gfp_flags);
 
 unlock:
 	bpf_task_storage_unlock();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0287176bfe9a..6347dcdee1fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13492,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		if (insn->imm == BPF_FUNC_task_storage_get ||
+		    insn->imm == BPF_FUNC_sk_storage_get ||
+		    insn->imm == BPF_FUNC_inode_storage_get) {
+			if (env->prog->aux->sleepable)
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__s32)GFP_KERNEL);
+			else
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__s32)GFP_ATOMIC);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
-- 
cgit v1.2.3


From ef078600eec20f20eb7833cf597d4a5edf2953c1 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 11 Mar 2022 12:11:35 -0800
Subject: bpf: Select proper size for bpf_prog_pack

Using HPAGE_PMD_SIZE as the size for bpf_prog_pack is not ideal in some
cases. Specifically, for NUMA systems, __vmalloc_node_range requires
PMD_SIZE * num_online_nodes() to allocate huge pages. Also, if the system
does not support huge pages (i.e., with cmdline option nohugevmalloc), it
is better to use PAGE_SIZE packs.

Add logic to select proper size for bpf_prog_pack. This solution is not
ideal, as it makes assumption about the behavior of module_alloc and
__vmalloc_node_range. However, it appears to be the easiest solution as
it doesn't require changes in module_alloc and vmalloc code.

Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220311201135.3573610-1-song@kernel.org
---
 kernel/bpf/core.c | 70 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1324f9523e7c..9d661e07e77c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -33,6 +33,7 @@
 #include <linux/extable.h>
 #include <linux/log2.h>
 #include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
 
 #include <asm/barrier.h>
 #include <asm/unaligned.h>
@@ -815,15 +816,9 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
  * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
  * to host BPF programs.
  */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE
-#else
-#define BPF_PROG_PACK_SIZE	PAGE_SIZE
-#endif
 #define BPF_PROG_CHUNK_SHIFT	6
 #define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)
 #define BPF_PROG_CHUNK_MASK	(~(BPF_PROG_CHUNK_SIZE - 1))
-#define BPF_PROG_CHUNK_COUNT	(BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
 
 struct bpf_prog_pack {
 	struct list_head list;
@@ -831,30 +826,56 @@ struct bpf_prog_pack {
 	unsigned long bitmap[];
 };
 
-#define BPF_PROG_MAX_PACK_PROG_SIZE	BPF_PROG_PACK_SIZE
 #define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
 
+static size_t bpf_prog_pack_size = -1;
+
+static int bpf_prog_chunk_count(void)
+{
+	WARN_ON_ONCE(bpf_prog_pack_size == -1);
+	return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE;
+}
+
 static DEFINE_MUTEX(pack_mutex);
 static LIST_HEAD(pack_list);
 
+static size_t select_bpf_prog_pack_size(void)
+{
+	size_t size;
+	void *ptr;
+
+	size = PMD_SIZE * num_online_nodes();
+	ptr = module_alloc(size);
+
+	/* Test whether we can get huge pages. If not just use PAGE_SIZE
+	 * packs.
+	 */
+	if (!ptr || !is_vm_area_hugepages(ptr))
+		size = PAGE_SIZE;
+
+	vfree(ptr);
+	return size;
+}
+
 static struct bpf_prog_pack *alloc_new_pack(void)
 {
 	struct bpf_prog_pack *pack;
 
-	pack = kzalloc(sizeof(*pack) + BITS_TO_BYTES(BPF_PROG_CHUNK_COUNT), GFP_KERNEL);
+	pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())),
+		       GFP_KERNEL);
 	if (!pack)
 		return NULL;
-	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+	pack->ptr = module_alloc(bpf_prog_pack_size);
 	if (!pack->ptr) {
 		kfree(pack);
 		return NULL;
 	}
-	bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+	bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
 	list_add_tail(&pack->list, &pack_list);
 
 	set_vm_flush_reset_perms(pack->ptr);
-	set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
-	set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+	set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
 	return pack;
 }
 
@@ -865,7 +886,11 @@ static void *bpf_prog_pack_alloc(u32 size)
 	unsigned long pos;
 	void *ptr = NULL;
 
-	if (size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+	mutex_lock(&pack_mutex);
+	if (bpf_prog_pack_size == -1)
+		bpf_prog_pack_size = select_bpf_prog_pack_size();
+
+	if (size > bpf_prog_pack_size) {
 		size = round_up(size, PAGE_SIZE);
 		ptr = module_alloc(size);
 		if (ptr) {
@@ -873,13 +898,12 @@ static void *bpf_prog_pack_alloc(u32 size)
 			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
 			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
 		}
-		return ptr;
+		goto out;
 	}
-	mutex_lock(&pack_mutex);
 	list_for_each_entry(pack, &pack_list, list) {
-		pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+		pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
 						 nbits, 0);
-		if (pos < BPF_PROG_CHUNK_COUNT)
+		if (pos < bpf_prog_chunk_count())
 			goto found_free_area;
 	}
 
@@ -905,13 +929,13 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	unsigned long pos;
 	void *pack_ptr;
 
-	if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) {
+	mutex_lock(&pack_mutex);
+	if (hdr->size > bpf_prog_pack_size) {
 		module_memfree(hdr);
-		return;
+		goto out;
 	}
 
-	pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
-	mutex_lock(&pack_mutex);
+	pack_ptr = (void *)((unsigned long)hdr & ~(bpf_prog_pack_size - 1));
 
 	list_for_each_entry(tmp, &pack_list, list) {
 		if (tmp->ptr == pack_ptr) {
@@ -927,8 +951,8 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
 
 	bitmap_clear(pack->bitmap, pos, nbits);
-	if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
-				       BPF_PROG_CHUNK_COUNT, 0) == 0) {
+	if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+				       bpf_prog_chunk_count(), 0) == 0) {
 		list_del(&pack->list);
 		module_memfree(pack->ptr);
 		kfree(pack);
-- 
cgit v1.2.3


From ee2a098851bfbe8bcdd964c0121f4246f00ff41e Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Mon, 14 Mar 2022 11:20:41 -0700
Subject: bpf: Adjust BPF stack helper functions to accommodate skip > 0

Let's say that the caller has storage for num_elem stack frames.  Then,
the BPF stack helper functions walk the stack for only num_elem frames.
This means that if skip > 0, one keeps only 'num_elem - skip' frames.

This is because it sets init_nr in the perf_callchain_entry to the end
of the buffer to save num_elem entries only.  I believe it was because
the perf callchain code unwound the stack frames until it reached the
global max size (sysctl_perf_event_max_stack).

However it now has perf_callchain_entry_ctx.max_stack to limit the
iteration locally.  This simplifies the code to handle init_nr in the
BPF callstack entries and removes the confusion with the perf_event's
__PERF_SAMPLE_CALLCHAIN_EARLY which sets init_nr to 0.

Also change the comment on bpf_get_stack() in the header file to be
more explicit what the return value means.

Fixes: c195651e565a ("bpf: add bpf_get_stack helper")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com
Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org

Based-on-patch-by: Eugene Loh <eugene.loh@oracle.com>
---
 kernel/bpf/stackmap.c | 56 ++++++++++++++++++++++-----------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 38bdfcd06f55..34725bfa1e97 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -176,7 +176,7 @@ build_id_valid:
 }
 
 static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
 {
 #ifdef CONFIG_STACKTRACE
 	struct perf_callchain_entry *entry;
@@ -187,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 	if (!entry)
 		return NULL;
 
-	entry->nr = init_nr +
-		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
-				     sysctl_perf_event_max_stack - init_nr, 0);
+	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+					 max_depth, 0);
 
 	/* stack_trace_save_tsk() works on unsigned long array, while
 	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -201,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 		int i;
 
 		/* copy data from the end to avoid using extra buffer */
-		for (i = entry->nr - 1; i >= (int)init_nr; i--)
+		for (i = entry->nr - 1; i >= 0; i--)
 			to[i] = (u64)(from[i]);
 	}
 
@@ -218,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / stack_map_data_size(map);
-	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
 	u64 *ips;
 	bool hash_matches;
 
-	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
-	 */
-	trace_nr = trace->nr - init_nr;
-
-	if (trace_nr <= skip)
+	if (trace->nr <= skip)
 		/* skipping more than usable stack trace */
 		return -EFAULT;
 
-	trace_nr -= skip;
+	trace_nr = trace->nr - skip;
 	trace_len = trace_nr * sizeof(u64);
-	ips = trace->ip + skip + init_nr;
+	ips = trace->ip + skip;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
@@ -295,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	u32 max_depth = map->value_size / stack_map_data_size(map);
-	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
-	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	bool user = flags & BPF_F_USER_STACK;
 	struct perf_callchain_entry *trace;
 	bool kernel = !user;
@@ -305,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 		return -EINVAL;
 
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+	max_depth += skip;
+	if (max_depth > sysctl_perf_event_max_stack)
+		max_depth = sysctl_perf_event_max_stack;
+
+	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+				   false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -397,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 			    struct perf_callchain_entry *trace_in,
 			    void *buf, u32 size, u64 flags)
 {
-	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	bool user = flags & BPF_F_USER_STACK;
@@ -422,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		goto err_fault;
 
 	num_elem = size / elem_size;
-	if (sysctl_perf_event_max_stack < num_elem)
-		init_nr = 0;
-	else
-		init_nr = sysctl_perf_event_max_stack - num_elem;
+	max_depth = num_elem + skip;
+	if (sysctl_perf_event_max_stack < max_depth)
+		max_depth = sysctl_perf_event_max_stack;
 
 	if (trace_in)
 		trace = trace_in;
 	else if (kernel && task)
-		trace = get_callchain_entry_for_task(task, init_nr);
+		trace = get_callchain_entry_for_task(task, max_depth);
 	else
-		trace = get_perf_callchain(regs, init_nr, kernel, user,
-					   sysctl_perf_event_max_stack,
+		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 					   false, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
-	trace_nr = trace->nr - init_nr;
-	if (trace_nr < skip)
+	if (trace->nr < skip)
 		goto err_fault;
 
-	trace_nr -= skip;
+	trace_nr = trace->nr - skip;
 	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
 	copy_len = trace_nr * elem_size;
-	ips = trace->ip + skip + init_nr;
+
+	ips = trace->ip + skip;
 	if (user && user_build_id)
 		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
 	else
-- 
cgit v1.2.3


From 7ada3787e91c89b0aa7abf47682e8e587b855c13 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 20 Mar 2022 20:00:03 +0530
Subject: bpf: Check for NULL return from bpf_get_btf_vmlinux

When CONFIG_DEBUG_INFO_BTF is disabled, bpf_get_btf_vmlinux can return a
NULL pointer. Check for it in btf_get_module_btf to prevent a NULL pointer
dereference.

While kernel test robot only complained about this specific case, let's
also check for NULL in other call sites of bpf_get_btf_vmlinux.

Fixes: 9492450fd287 ("bpf: Always raise reference in btf_get_module_btf")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220320143003.589540-1-memxor@gmail.com
---
 kernel/bpf/btf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6d9e711cb5d4..ce212bf39b2b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -534,6 +534,8 @@ static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
 	btf = bpf_get_btf_vmlinux();
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
+	if (!btf)
+		return -EINVAL;
 
 	ret = btf_find_by_name_kind(btf, name, kind);
 	/* ret is never zero, since btf_find_by_name_kind returns
@@ -6584,7 +6586,7 @@ static struct btf *btf_get_module_btf(const struct module *module)
 
 	if (!module) {
 		btf = bpf_get_btf_vmlinux();
-		if (!IS_ERR(btf))
+		if (!IS_ERR_OR_NULL(btf))
 			btf_get(btf);
 		return btf;
 	}
@@ -7180,6 +7182,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
 	main_btf = bpf_get_btf_vmlinux();
 	if (IS_ERR(main_btf))
 		return ERR_CAST(main_btf);
+	if (!main_btf)
+		return ERR_PTR(-EINVAL);
 
 	local_type = btf_type_by_id(local_btf, local_type_id);
 	if (!local_type)
-- 
cgit v1.2.3


From a635415a064e77bcfbf43da413fd9dfe0bbed9cb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 21 Mar 2022 08:11:52 +0000
Subject: watch_queue: Fix NULL dereference in error cleanup

In watch_queue_set_size(), the error cleanup code doesn't take account of
the fact that __free_page() can't handle a NULL pointer when trying to free
up buffer pages that did get allocated.

Fix this by only calling __free_page() on the pages actually allocated.

Without the fix, this can lead to something like the following:

BUG: KASAN: null-ptr-deref in __free_pages+0x1f/0x1b0 mm/page_alloc.c:5473
Read of size 4 at addr 0000000000000034 by task syz-executor168/3599
...
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 __kasan_report mm/kasan/report.c:446 [inline]
 kasan_report.cold+0x66/0xdf mm/kasan/report.c:459
 check_region_inline mm/kasan/generic.c:183 [inline]
 kasan_check_range+0x13d/0x180 mm/kasan/generic.c:189
 instrument_atomic_read include/linux/instrumented.h:71 [inline]
 atomic_read include/linux/atomic/atomic-instrumented.h:27 [inline]
 page_ref_count include/linux/page_ref.h:67 [inline]
 put_page_testzero include/linux/mm.h:717 [inline]
 __free_pages+0x1f/0x1b0 mm/page_alloc.c:5473
 watch_queue_set_size+0x499/0x630 kernel/watch_queue.c:275
 pipe_ioctl+0xac/0x2b0 fs/pipe.c:632
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:874 [inline]
 __se_sys_ioctl fs/ioctl.c:860 [inline]
 __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Fixes: c73be61cede5 ("pipe: Add general notification queue support")
Reported-and-tested-by: syzbot+d55757faa9b80590767b@syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
---
 kernel/watch_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 00703444a219..5848d4795816 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -271,7 +271,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
 	return 0;
 
 error_p:
-	for (i = 0; i < nr_pages; i++)
+	while (--i >= 0)
 		__free_page(pages[i]);
 	kfree(pages);
 error:
-- 
cgit v1.2.3


From 3d8dcf278b1ee1eff1e90be848fa2237db4c07a7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 21 Mar 2022 11:18:54 +0000
Subject: watch_queue: Actually free the watch

free_watch() does everything barring actually freeing the watch object.  Fix
this by adding the missing kfree.

kmemleak produces a report something like the following.  Note that as an
address can be seen in the first word, the watch would appear to have gone
through call_rcu().

BUG: memory leak
unreferenced object 0xffff88810ce4a200 (size 96):
  comm "syz-executor352", pid 3605, jiffies 4294947473 (age 13.720s)
  hex dump (first 32 bytes):
    e0 82 48 0d 81 88 ff ff 00 00 00 00 00 00 00 00  ..H.............
    80 a2 e4 0c 81 88 ff ff 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff8214e6cc>] kmalloc include/linux/slab.h:581 [inline]
    [<ffffffff8214e6cc>] kzalloc include/linux/slab.h:714 [inline]
    [<ffffffff8214e6cc>] keyctl_watch_key+0xec/0x2e0 security/keys/keyctl.c:1800
    [<ffffffff8214ec84>] __do_sys_keyctl+0x3c4/0x490 security/keys/keyctl.c:2016
    [<ffffffff84493a25>] do_syscall_x64 arch/x86/entry/common.c:50 [inline]
    [<ffffffff84493a25>] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
    [<ffffffff84600068>] entry_SYSCALL_64_after_hwframe+0x44/0xae

Fixes: c73be61cede5 ("pipe: Add general notification queue support")
Reported-and-tested-by: syzbot+6e2de48f06cdb2884bfc@syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells@redhat.com>
---
 kernel/watch_queue.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 5848d4795816..3990e4df3d7b 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -395,6 +395,7 @@ static void free_watch(struct rcu_head *rcu)
 	put_watch_queue(rcu_access_pointer(watch->queue));
 	atomic_dec(&watch->cred->user->nr_watches);
 	put_cred(watch->cred);
+	kfree(watch);
 }
 
 static void __put_watch(struct kref *kref)
-- 
cgit v1.2.3


From 583669ab3aed29994e50bde6c66b52d44e1bdb73 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sun, 20 Mar 2022 15:52:40 +0800
Subject: bpf: Simplify check in btf_parse_hdr()

Replace offsetof(hdr_len) + sizeof(hdr_len) with offsetofend(hdr_len) to
simplify the check for correctness of btf_data_size in btf_parse_hdr()

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220320075240.1001728-1-ytcoode@gmail.com
---
 kernel/bpf/btf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ce212bf39b2b..24788ce564a0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4482,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 	btf = env->btf;
 	btf_data_size = btf->data_size;
 
-	if (btf_data_size <
-	    offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+	if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
 		btf_verifier_log(env, "hdr_len not found");
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From f705ec764b34323412f14b9bd95412e9bcb8770b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 21 Mar 2022 08:01:12 +0100
Subject: Revert "bpf: Add support to inline bpf_get_func_ip helper on x86"

This reverts commit 97ee4d20ee67eb462581a7af01442de6586e390b.

Following change is adding more complexity to bpf_get_func_ip
helper for kprobe_multi programs, which can't be inlined easily.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220321070113.1449167-2-jolsa@kernel.org
---
 kernel/bpf/verifier.c    | 21 +--------------------
 kernel/trace/bpf_trace.c |  1 -
 2 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6347dcdee1fd..571ccd7f04eb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13698,7 +13698,7 @@ patch_map_ops_generic:
 			continue;
 		}
 
-		/* Implement tracing bpf_get_func_ip inline. */
+		/* Implement bpf_get_func_ip inline. */
 		if (prog_type == BPF_PROG_TYPE_TRACING &&
 		    insn->imm == BPF_FUNC_get_func_ip) {
 			/* Load IP address from ctx - 16 */
@@ -13713,25 +13713,6 @@ patch_map_ops_generic:
 			continue;
 		}
 
-#ifdef CONFIG_X86
-		/* Implement kprobe_multi bpf_get_func_ip inline. */
-		if (prog_type == BPF_PROG_TYPE_KPROBE &&
-		    eatype == BPF_TRACE_KPROBE_MULTI &&
-		    insn->imm == BPF_FUNC_get_func_ip) {
-			/* Load IP address from ctx (struct pt_regs) ip */
-			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
-						  offsetof(struct pt_regs, ip));
-
-			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
-			if (!new_prog)
-				return -ENOMEM;
-
-			env->prog = prog = new_prog;
-			insn      = new_prog->insnsi + i + delta;
-			continue;
-		}
-#endif
-
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
 		/* all functions that have prototype and verifier allowed
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9a7b6be655e4..52c2998e1dc3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1042,7 +1042,6 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
 
 BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
 {
-	/* This helper call is inlined by verifier on x86. */
 	return instruction_pointer(regs);
 }
 
-- 
cgit v1.2.3


From f70986902c86f88612ed45a96aa7cf4caa65f7c1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 21 Mar 2022 08:01:13 +0100
Subject: bpf: Fix kprobe_multi return probe backtrace

Andrii reported that backtraces from kprobe_multi program attached
as return probes are not complete and showing just initial entry [1].

It's caused by changing registers to have original function ip address
as instruction pointer even for return probe, which will screw backtrace
from return probe.

This change keeps registers intact and store original entry ip and
link address on the stack in bpf_kprobe_multi_run_ctx struct, where
bpf_get_func_ip and bpf_get_attach_cookie helpers for kprobe_multi
programs can find it.

[1] https://lore.kernel.org/bpf/CAEf4BzZDDqK24rSKwXNp7XL3ErGD4bZa1M6c_c4EvDSt3jrZcg@mail.gmail.com/T/#m8d1301c0ea0892ddf9dc6fba57a57b8cf11b8c51

Fixes: ca74823c6e16 ("bpf: Add cookie support to programs attached with kprobe multi link")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220321070113.1449167-3-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 67 ++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 52c2998e1dc3..172ef545730d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -80,7 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
 				  u64 flags, const struct btf **btf,
 				  s32 *btf_id);
-static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -1042,7 +1043,7 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
 
 BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
 {
-	return instruction_pointer(regs);
+	return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
 }
 
 static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
@@ -1054,7 +1055,7 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
 
 BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
 {
-	return bpf_kprobe_multi_cookie(current->bpf_ctx, instruction_pointer(regs));
+	return bpf_kprobe_multi_cookie(current->bpf_ctx);
 }
 
 static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
@@ -2219,15 +2220,16 @@ struct bpf_kprobe_multi_link {
 	struct bpf_link link;
 	struct fprobe fp;
 	unsigned long *addrs;
-	/*
-	 * The run_ctx here is used to get struct bpf_kprobe_multi_link in
-	 * get_attach_cookie helper, so it can't be used to store data.
-	 */
-	struct bpf_run_ctx run_ctx;
 	u64 *cookies;
 	u32 cnt;
 };
 
+struct bpf_kprobe_multi_run_ctx {
+	struct bpf_run_ctx run_ctx;
+	struct bpf_kprobe_multi_link *link;
+	unsigned long entry_ip;
+};
+
 static void bpf_kprobe_multi_link_release(struct bpf_link *link)
 {
 	struct bpf_kprobe_multi_link *kmulti_link;
@@ -2281,18 +2283,21 @@ static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void
 	return __bpf_kprobe_multi_cookie_cmp(a, b);
 }
 
-static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip)
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
 {
+	struct bpf_kprobe_multi_run_ctx *run_ctx;
 	struct bpf_kprobe_multi_link *link;
+	u64 *cookie, entry_ip;
 	unsigned long *addr;
-	u64 *cookie;
 
 	if (WARN_ON_ONCE(!ctx))
 		return 0;
-	link = container_of(ctx, struct bpf_kprobe_multi_link, run_ctx);
+	run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+	link = run_ctx->link;
 	if (!link->cookies)
 		return 0;
-	addr = bsearch(&ip, link->addrs, link->cnt, sizeof(ip),
+	entry_ip = run_ctx->entry_ip;
+	addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
 		       __bpf_kprobe_multi_cookie_cmp);
 	if (!addr)
 		return 0;
@@ -2300,10 +2305,22 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip)
 	return *cookie;
 }
 
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+	struct bpf_kprobe_multi_run_ctx *run_ctx;
+
+	run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+	return run_ctx->entry_ip;
+}
+
 static int
 kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
-			   struct pt_regs *regs)
+			   unsigned long entry_ip, struct pt_regs *regs)
 {
+	struct bpf_kprobe_multi_run_ctx run_ctx = {
+		.link = link,
+		.entry_ip = entry_ip,
+	};
 	struct bpf_run_ctx *old_run_ctx;
 	int err;
 
@@ -2314,7 +2331,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 
 	migrate_disable();
 	rcu_read_lock();
-	old_run_ctx = bpf_set_run_ctx(&link->run_ctx);
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	err = bpf_prog_run(link->link.prog, regs);
 	bpf_reset_run_ctx(old_run_ctx);
 	rcu_read_unlock();
@@ -2329,24 +2346,10 @@ static void
 kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
 			  struct pt_regs *regs)
 {
-	unsigned long saved_ip = instruction_pointer(regs);
 	struct bpf_kprobe_multi_link *link;
 
-	/*
-	 * Because fprobe's regs->ip is set to the next instruction of
-	 * dynamic-ftrace instruction, correct entry ip must be set, so
-	 * that the bpf program can access entry address via regs as same
-	 * as kprobes.
-	 *
-	 * Both kprobe and kretprobe see the entry ip of traced function
-	 * as instruction pointer.
-	 */
-	instruction_pointer_set(regs, entry_ip);
-
 	link = container_of(fp, struct bpf_kprobe_multi_link, fp);
-	kprobe_multi_link_prog_run(link, regs);
-
-	instruction_pointer_set(regs, saved_ip);
+	kprobe_multi_link_prog_run(link, entry_ip, regs);
 }
 
 static int
@@ -2513,7 +2516,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 {
 	return -EOPNOTSUPP;
 }
-static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx, u64 ip)
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+	return 0;
+}
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 1b8ddbeeb9b819e62b7190115023ce858a159f5c Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 12 Feb 2022 15:27:42 -0500
Subject: mm/truncate: Inline invalidate_complete_page() into its one caller

invalidate_inode_page() is the only caller of invalidate_complete_page()
and inlining it reveals that the first check is unnecessary (because we
hold the page locked, and we just retrieved the mapping from the page).
Actually, it does make a difference, in that tail pages no longer fail
at this check, so it's now possible to remove a tail page from a mapping.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 kernel/futex/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 51dd822a8060..b22ef1efe751 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -302,7 +302,7 @@ again:
 	 * found it, but truncated or holepunched or subjected to
 	 * invalidate_complete_page2 before we got the page lock (also
 	 * cases which we are happy to fail).  And we hold a reference,
-	 * so refcount care in invalidate_complete_page's remove_mapping
+	 * so refcount care in invalidate_inode_page's remove_mapping
 	 * prevents drop_caches from setting mapping to NULL beneath us.
 	 *
 	 * The case we do have to guard against is when memory pressure made
-- 
cgit v1.2.3


From eed05e54d275b3cfc5d8c79843c5276a5878e94a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 3 Feb 2022 09:06:08 -0500
Subject: mm: Add DEFINE_PAGE_VMA_WALK and DEFINE_FOLIO_VMA_WALK

Instead of declaring a struct page_vma_mapped_walk directly,
use these helpers to allow us to transition to a PFN approach in the
following patches.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 kernel/events/uprobes.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index eed2f7437d96..6418083901d4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 				struct page *old_page, struct page *new_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct page_vma_mapped_walk pvmw = {
-		.page = compound_head(old_page),
-		.vma = vma,
-		.address = addr,
-	};
+	DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
 	int err;
 	struct mmu_notifier_range range;
 
-- 
cgit v1.2.3


From d56c9fe6a06820d5ef8188d96bf4345c7bdba249 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 21 Mar 2022 11:58:02 -0700
Subject: bpf: Fix warning for cast from restricted gfp_t in verifier

This fixes the sparse warning reported by the kernel test robot:

kernel/bpf/verifier.c:13499:47: sparse: warning: cast from restricted gfp_t
kernel/bpf/verifier.c:13501:47: sparse: warning: cast from restricted gfp_t

This fix can be verified locally by running:
1) wget
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
-O make.cross

2) chmod +x ~/bin/make.cross

3) COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 ./make.cross
C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

Fixes: b00fa38a9c1c ("bpf: Enable non-atomic allocations in local storage")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220321185802.824223-1-joannekoong@fb.com
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 571ccd7f04eb..d175b70067b3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13496,9 +13496,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		    insn->imm == BPF_FUNC_sk_storage_get ||
 		    insn->imm == BPF_FUNC_inode_storage_get) {
 			if (env->prog->aux->sleepable)
-				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__s32)GFP_KERNEL);
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
 			else
-				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__s32)GFP_ATOMIC);
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
 			insn_buf[1] = *insn;
 			cnt = 2;
 
-- 
cgit v1.2.3


From 96805674e5624b3c79780a2b41c7a3d6bc38dc76 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 21 Mar 2022 11:00:08 -0700
Subject: bpf: Fix bpf_prog_pack for multi-node setup

module_alloc requires num_online_nodes * PMD_SIZE to allocate huge pages.
bpf_prog_pack uses pack of size num_online_nodes * PMD_SIZE.
OTOH, module_alloc returns addresses that are PMD_SIZE aligned (instead of
num_online_nodes * PMD_SIZE aligned). Therefore, PMD_MASK should be used
to calculate pack_ptr in bpf_prog_pack_free().

Fixes: ef078600eec2 ("bpf: Select proper size for bpf_prog_pack")
Reported-by: syzbot+c946805b5ce6ab87df0b@syzkaller.appspotmail.com
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220321180009.1944482-2-song@kernel.org
---
 kernel/bpf/core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9d661e07e77c..f6b20fcbeb24 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -829,6 +829,7 @@ struct bpf_prog_pack {
 #define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
 
 static size_t bpf_prog_pack_size = -1;
+static size_t bpf_prog_pack_mask = -1;
 
 static int bpf_prog_chunk_count(void)
 {
@@ -850,8 +851,12 @@ static size_t select_bpf_prog_pack_size(void)
 	/* Test whether we can get huge pages. If not just use PAGE_SIZE
 	 * packs.
 	 */
-	if (!ptr || !is_vm_area_hugepages(ptr))
+	if (!ptr || !is_vm_area_hugepages(ptr)) {
 		size = PAGE_SIZE;
+		bpf_prog_pack_mask = PAGE_MASK;
+	} else {
+		bpf_prog_pack_mask = PMD_MASK;
+	}
 
 	vfree(ptr);
 	return size;
@@ -935,7 +940,7 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 		goto out;
 	}
 
-	pack_ptr = (void *)((unsigned long)hdr & ~(bpf_prog_pack_size - 1));
+	pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask);
 
 	list_for_each_entry(tmp, &pack_list, list) {
 		if (tmp->ptr == pack_ptr) {
-- 
cgit v1.2.3


From e581094167beb674c8a3bc2c27362f50dc5dd617 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 21 Mar 2022 11:00:09 -0700
Subject: bpf: Fix bpf_prog_pack when PMU_SIZE is not defined

PMD_SIZE is not available in some special config, e.g. ARCH=arm with
CONFIG_MMU=n. Use bpf_prog_pack of PAGE_SIZE in these cases.

Fixes: ef078600eec2 ("bpf: Select proper size for bpf_prog_pack")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220321180009.1944482-3-song@kernel.org
---
 kernel/bpf/core.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f6b20fcbeb24..13e9dbeeedf3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -840,12 +840,23 @@ static int bpf_prog_chunk_count(void)
 static DEFINE_MUTEX(pack_mutex);
 static LIST_HEAD(pack_list);
 
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+#define BPF_HPAGE_SIZE PMD_SIZE
+#define BPF_HPAGE_MASK PMD_MASK
+#else
+#define BPF_HPAGE_SIZE PAGE_SIZE
+#define BPF_HPAGE_MASK PAGE_MASK
+#endif
+
 static size_t select_bpf_prog_pack_size(void)
 {
 	size_t size;
 	void *ptr;
 
-	size = PMD_SIZE * num_online_nodes();
+	size = BPF_HPAGE_SIZE * num_online_nodes();
 	ptr = module_alloc(size);
 
 	/* Test whether we can get huge pages. If not just use PAGE_SIZE
@@ -855,7 +866,7 @@ static size_t select_bpf_prog_pack_size(void)
 		size = PAGE_SIZE;
 		bpf_prog_pack_mask = PAGE_MASK;
 	} else {
-		bpf_prog_pack_mask = PMD_MASK;
+		bpf_prog_pack_mask = BPF_HPAGE_MASK;
 	}
 
 	vfree(ptr);
-- 
cgit v1.2.3


From ab31c7fd2d37bc3580d9d712d5f2dfb69901fca9 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Tue, 22 Mar 2022 08:39:22 +0100
Subject: sched/numa: Fix boot crash on arm64 systems

Qian Cai reported a boot crash on arm64 systems, caused by:

  0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")

The bug is that node_state() must be supplied a valid node_states[] array index,
but in task_numa_placement() the max_nid search can fail with NUMA_NO_NODE,
which is not a valid index.

Fix it by checking that max_nid is a valid index.

[ mingo: Added changelog. ]

Fixes: 0fb3978b0aac ("sched/numa: Fix NUMA topology for systems with CPU-less nodes")
Reported-by: Qian Cai <quic_qiancai@quicinc.com>
Tested-by: Qian Cai <quic_qiancai@quicinc.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 89d21fda106c..ee0664c9d291 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2437,7 +2437,7 @@ static void task_numa_placement(struct task_struct *p)
 	}
 
 	/* Cannot migrate task to CPU-less node */
-	if (!node_state(max_nid, N_CPU)) {
+	if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
 		int near_nid = max_nid;
 		int distance, near_distance = INT_MAX;
 
-- 
cgit v1.2.3


From ee1fee900537b5d9560e9f937402de5ddc8412f3 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Sat, 19 Mar 2022 02:08:37 +0100
Subject: ptrace: Check PTRACE_O_SUSPEND_SECCOMP permission on PTRACE_SEIZE

Setting PTRACE_O_SUSPEND_SECCOMP is supposed to be a highly privileged
operation because it allows the tracee to completely bypass all seccomp
filters on kernels with CONFIG_CHECKPOINT_RESTORE=y. It is only supposed to
be settable by a process with global CAP_SYS_ADMIN, and only if that
process is not subject to any seccomp filters at all.

However, while these permission checks were done on the PTRACE_SETOPTIONS
path, they were missing on the PTRACE_SEIZE path, which also sets
user-specified ptrace flags.

Move the permissions checks out into a helper function and let both
ptrace_attach() and ptrace_setoptions() call it.

Cc: stable@kernel.org
Fixes: 13c4a90119d2 ("seccomp: add ptrace options for suspend/resume")
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://lkml.kernel.org/r/20220319010838.1386861-1-jannh@google.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..ccc4b465775b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -371,6 +371,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	return !err;
 }
 
+static int check_ptrace_options(unsigned long data)
+{
+	if (data & ~(unsigned long)PTRACE_O_MASK)
+		return -EINVAL;
+
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+		    !IS_ENABLED(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+	return 0;
+}
+
 static int ptrace_attach(struct task_struct *task, long request,
 			 unsigned long addr,
 			 unsigned long flags)
@@ -382,8 +402,16 @@ static int ptrace_attach(struct task_struct *task, long request,
 	if (seize) {
 		if (addr != 0)
 			goto out;
+		/*
+		 * This duplicates the check in check_ptrace_options() because
+		 * ptrace_attach() and ptrace_setoptions() have historically
+		 * used different error codes for unknown ptrace options.
+		 */
 		if (flags & ~(unsigned long)PTRACE_O_MASK)
 			goto out;
+		retval = check_ptrace_options(flags);
+		if (retval)
+			return retval;
 		flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
 	} else {
 		flags = PT_PTRACED;
@@ -654,22 +682,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 {
 	unsigned flags;
+	int ret;
 
-	if (data & ~(unsigned long)PTRACE_O_MASK)
-		return -EINVAL;
-
-	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
-		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
-		    !IS_ENABLED(CONFIG_SECCOMP))
-			return -EINVAL;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
-		    current->ptrace & PT_SUSPEND_SECCOMP)
-			return -EPERM;
-	}
+	ret = check_ptrace_options(data);
+	if (ret)
+		return ret;
 
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;
-- 
cgit v1.2.3


From e16faf26780fc0c8dd693ea9ee8420a7706cb2f5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 22 Mar 2022 14:43:17 -0700
Subject: cma: factor out minimum alignment requirement

Patch series "mm: enforce pageblock_order < MAX_ORDER".

Having pageblock_order >= MAX_ORDER seems to be able to happen in corner
cases and some parts of the kernel are not prepared for it.

For example, Aneesh has shown [1] that such kernels can be compiled on
ppc64 with 64k base pages by setting FORCE_MAX_ZONEORDER=8, which will
run into a WARN_ON_ONCE(order >= MAX_ORDER) in comapction code right
during boot.

We can get pageblock_order >= MAX_ORDER when the default hugetlb size is
bigger than the maximum allocation granularity of the buddy, in which
case we are no longer talking about huge pages but instead gigantic
pages.

Having pageblock_order >= MAX_ORDER can only make alloc_contig_range()
of such gigantic pages more likely to succeed.

Reliable use of gigantic pages either requires boot time allcoation or
CMA, no need to overcomplicate some places in the kernel to optimize for
corner cases that are broken in other areas of the kernel.

This patch (of 2):

Let's enforce pageblock_order < MAX_ORDER and simplify.

Especially patch #1 can be regarded a cleanup before:
	[PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range
	alignment. [2]

[1] https://lkml.kernel.org/r/87r189a2ks.fsf@linux.ibm.com
[2] https://lkml.kernel.org/r/20220211164135.1803616-1-zi.yan@sent.com

Link: https://lkml.kernel.org/r/20220214174132.219303-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Rob Herring <robh@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: John Garry via iommu <iommu@lists.linux-foundation.org>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma/contiguous.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..6ea80ae42622 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
 
 static int __init rmem_cma_setup(struct reserved_mem *rmem)
 {
-	phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
-	phys_addr_t mask = align - 1;
 	unsigned long node = rmem->fdt_node;
 	bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
 	struct cma *cma;
@@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
 	    of_get_flat_dt_prop(node, "no-map", NULL))
 		return -EINVAL;
 
-	if ((rmem->base & mask) || (rmem->size & mask)) {
+	if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
 		pr_err("Reserved memory: incorrect alignment of CMA region\n");
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From c574bbe917036c8968b984c82c7b13194fe5ce98 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 22 Mar 2022 14:46:23 -0700
Subject: NUMA balancing: optimize page placement for memory tiering system

With the advent of various new memory types, some machines will have
multiple types of memory, e.g.  DRAM and PMEM (persistent memory).  The
memory subsystem of these machines can be called memory tiering system,
because the performance of the different types of memory are usually
different.

In such system, because of the memory accessing pattern changing etc,
some pages in the slow memory may become hot globally.  So in this
patch, the NUMA balancing mechanism is enhanced to optimize the page
placement among the different memory types according to hot/cold
dynamically.

In a typical memory tiering system, there are CPUs, fast memory and slow
memory in each physical NUMA node.  The CPUs and the fast memory will be
put in one logical node (called fast memory node), while the slow memory
will be put in another (faked) logical node (called slow memory node).
That is, the fast memory is regarded as local while the slow memory is
regarded as remote.  So it's possible for the recently accessed pages in
the slow memory node to be promoted to the fast memory node via the
existing NUMA balancing mechanism.

The original NUMA balancing mechanism will stop to migrate pages if the
free memory of the target node becomes below the high watermark.  This
is a reasonable policy if there's only one memory type.  But this makes
the original NUMA balancing mechanism almost do not work to optimize
page placement among different memory types.  Details are as follows.

It's the common cases that the working-set size of the workload is
larger than the size of the fast memory nodes.  Otherwise, it's
unnecessary to use the slow memory at all.  So, there are almost always
no enough free pages in the fast memory nodes, so that the globally hot
pages in the slow memory node cannot be promoted to the fast memory
node.  To solve the issue, we have 2 choices as follows,

a. Ignore the free pages watermark checking when promoting hot pages
   from the slow memory node to the fast memory node.  This will
   create some memory pressure in the fast memory node, thus trigger
   the memory reclaiming.  So that, the cold pages in the fast memory
   node will be demoted to the slow memory node.

b. Define a new watermark called wmark_promo which is higher than
   wmark_high, and have kswapd reclaiming pages until free pages reach
   such watermark.  The scenario is as follows: when we want to promote
   hot-pages from a slow memory to a fast memory, but fast memory's free
   pages would go lower than high watermark with such promotion, we wake
   up kswapd with wmark_promo watermark in order to demote cold pages and
   free us up some space.  So, next time we want to promote hot-pages we
   might have a chance of doing so.

The choice "a" may create high memory pressure in the fast memory node.
If the memory pressure of the workload is high, the memory pressure
may become so high that the memory allocation latency of the workload
is influenced, e.g.  the direct reclaiming may be triggered.

The choice "b" works much better at this aspect.  If the memory
pressure of the workload is high, the hot pages promotion will stop
earlier because its allocation watermark is higher than that of the
normal memory allocation.  So in this patch, choice "b" is implemented.
A new zone watermark (WMARK_PROMO) is added.  Which is larger than the
high watermark and can be controlled via watermark_scale_factor.

In addition to the original page placement optimization among sockets,
the NUMA balancing mechanism is extended to be used to optimize page
placement according to hot/cold among different memory types.  So the
sysctl user space interface (numa_balancing) is extended in a backward
compatible way as follow, so that the users can enable/disable these
functionality individually.

The sysctl is converted from a Boolean value to a bits field.  The
definition of the flags is,

- 0: NUMA_BALANCING_DISABLED
- 1: NUMA_BALANCING_NORMAL
- 2: NUMA_BALANCING_MEMORY_TIERING

We have tested the patch with the pmbench memory accessing benchmark
with the 80:20 read/write ratio and the Gauss access address
distribution on a 2 socket Intel server with Optane DC Persistent
Memory Model.  The test results shows that the pmbench score can
improve up to 95.9%.

Thanks Andrew Morton to help fix the document format error.

Link: https://lkml.kernel.org/r/20220221084529.1052339-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: zhongjiang-ali <zhongjiang-ali@linux.alibaba.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Feng Tang <feng.tang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/core.c | 21 +++++++++++++++++----
 kernel/sysctl.c     |  2 +-
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..da6a60383645 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4279,7 +4279,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
 
 #ifdef CONFIG_NUMA_BALANCING
 
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
 {
 	if (enabled)
 		static_branch_enable(&sched_numa_balancing);
@@ -4287,13 +4289,22 @@ void set_numabalancing_state(bool enabled)
 		static_branch_disable(&sched_numa_balancing);
 }
 
+void set_numabalancing_state(bool enabled)
+{
+	if (enabled)
+		sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+	else
+		sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+	__set_numabalancing_state(enabled);
+}
+
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
 			  void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table t;
 	int err;
-	int state = static_branch_likely(&sched_numa_balancing);
+	int state = sysctl_numa_balancing_mode;
 
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -4303,8 +4314,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
 	if (err < 0)
 		return err;
-	if (write)
-		set_numabalancing_state(state);
+	if (write) {
+		sysctl_numa_balancing_mode = state;
+		__set_numabalancing_state(state);
+	}
 	return err;
 }
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 730ab56d9e92..3395b99d59a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1696,7 +1696,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sysctl_numa_balancing,
 		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
+		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
 	{
-- 
cgit v1.2.3


From eca344a7362e0f34f179298fd8366bcd556eede1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 23 Mar 2022 10:32:51 -0400
Subject: tracing: Have trace event string test handle zero length strings

If a trace event has in its TP_printk():

 "%*.s", len, len ? __get_str(string) : NULL

It is perfectly valid if len is zero and passing in the NULL.
Unfortunately, the runtime string check at time of reading the trace sees
the NULL and flags it as a bad string and produces a WARN_ON().

Handle this case by passing into the test function if the format has an
asterisk (star) and if so, if the length is zero, then mark it as safe.

Link: https://lore.kernel.org/all/YjsWzuw5FbWPrdqq@bfoster/

Cc: stable@vger.kernel.org
Reported-by: Brian Foster <bfoster@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Fixes: 9a6944fee68e2 ("tracing: Add a verifier to check string pointers for trace events")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eb44418574f9..96265a717ca4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3663,12 +3663,17 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
 }
 
 /* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+			   bool star, int len)
 {
 	unsigned long addr = (unsigned long)str;
 	struct trace_event *trace_event;
 	struct trace_event_call *event;
 
+	/* Ignore strings with no length */
+	if (star && !len)
+		return true;
+
 	/* OK if part of the event data */
 	if ((addr >= (unsigned long)iter->ent) &&
 	    (addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3854,7 +3859,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 		 * instead. See samples/trace_events/trace-events-sample.h
 		 * for reference.
 		 */
-		if (WARN_ONCE(!trace_safe_str(iter, str),
+		if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
 			      "fmt: '%s' current_buffer: '%s'",
 			      fmt, show_buffer(&iter->seq))) {
 			int ret;
-- 
cgit v1.2.3


From a7cd9a5376aa188529393a802329645dc8c5a63c Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 23 Mar 2022 16:05:35 -0700
Subject: kernel/ksysfs.c: use helper macro __ATTR_RW

Use helper macro __ATTR_RW to define kobj_attribute to make code more
clear.  Minor readability improvement.

Link: https://lkml.kernel.org/r/20220222112034.48298-1-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ksysfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..b1292a57c2a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,8 +24,7 @@
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
 #define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
-	__ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
 
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
-- 
cgit v1.2.3


From b1e2c8df0f007f34c373925471c7b493c9e16620 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 23 Mar 2022 16:06:29 -0700
Subject: cgroup: use irqsave in cgroup_rstat_flush_locked().

All callers of cgroup_rstat_flush_locked() acquire cgroup_rstat_lock
either with spin_lock_irq() or spin_lock_irqsave().
cgroup_rstat_flush_locked() itself acquires cgroup_rstat_cpu_lock which
is a raw_spin_lock.  This lock is also acquired in
cgroup_rstat_updated() in IRQ context and therefore requires _irqsave()
locking suffix in cgroup_rstat_flush_locked().

Since there is no difference between spin_lock_t and raw_spin_lock_t on
!RT lockdep does not complain here.  On RT lockdep complains because the
interrupts were not disabled here and a deadlock is possible.

Acquire the raw_spin_lock_t with disabled interrupts.

Link: https://lkml.kernel.org/r/20220301122143.1521823-2-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zefan Li <lizefan.x@bytedance.com>
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Subject: cgroup: add a comment to cgroup_rstat_flush_locked().

Add a comment why spin_lock_irq() -> raw_spin_lock_irqsave() is needed.

Link: https://lkml.kernel.org/r/Yh+DOK73hfVV5ThX@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup/rstat.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..ba7a660184e4 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -153,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
 						       cpu);
 		struct cgroup *pos = NULL;
+		unsigned long flags;
 
-		raw_spin_lock(cpu_lock);
+		/*
+		 * The _irqsave() is needed because cgroup_rstat_lock is
+		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+		 * this lock with the _irq() suffix only disables interrupts on
+		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+		 * interrupts on both configurations. The _irqsave() ensures
+		 * that interrupts are always disabled and later restored.
+		 */
+		raw_spin_lock_irqsave(cpu_lock, flags);
 		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
 			struct cgroup_subsys_state *css;
 
@@ -166,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 				css->ss->css_rstat_flush(css, cpu);
 			rcu_read_unlock();
 		}
-		raw_spin_unlock(cpu_lock);
+		raw_spin_unlock_irqrestore(cpu_lock, flags);
 
 		/* if @may_sleep, play nice and yield if necessary */
 		if (may_sleep && (need_resched() ||
-- 
cgit v1.2.3


From 1a2383e8b84c0451fd9b1eec3b9aab16f30b597c Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Wed, 23 Mar 2022 16:06:51 -0700
Subject: panic: unset panic_on_warn inside panic()

In the current code, the following three places need to unset
panic_on_warn before calling panic() to avoid recursive panics:

kernel/kcsan/report.c: print_report()
kernel/sched/core.c: __schedule_bug()
mm/kfence/report.c: kfence_report_error()

In order to avoid copy-pasting "panic_on_warn = 0" all over the places,
it is better to move it inside panic() and then remove it from the other
places.

Link: https://lkml.kernel.org/r/1644324666-15947-4-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Xuefeng Li <lixuefeng@loongson.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 55b50e052ec3..95ba825522dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -185,6 +185,16 @@ void panic(const char *fmt, ...)
 	int old_cpu, this_cpu;
 	bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
 
+	if (panic_on_warn) {
+		/*
+		 * This thread may hit another WARN() in the panic path.
+		 * Resetting this prevents additional WARN() from panicking the
+		 * system on this thread.  Other threads are blocked by the
+		 * panic_mutex in panic().
+		 */
+		panic_on_warn = 0;
+	}
+
 	/*
 	 * Disable local interrupts. This will prevent panic_smp_self_stop
 	 * from deadlocking the first cpu that invokes the panic, since
@@ -576,16 +586,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 	if (regs)
 		show_regs(regs);
 
-	if (panic_on_warn) {
-		/*
-		 * This thread may hit another WARN() in the panic path.
-		 * Resetting this prevents additional WARN() from panicking the
-		 * system on this thread.  Other threads are blocked by the
-		 * panic_mutex in panic().
-		 */
-		panic_on_warn = 0;
+	if (panic_on_warn)
 		panic("panic_on_warn set ...\n");
-	}
 
 	if (!regs)
 		dump_stack();
-- 
cgit v1.2.3


From 92333baaceb399c7878c7f868a0746d7d22f9b77 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Wed, 23 Mar 2022 16:07:00 -0700
Subject: taskstats: remove unneeded dead assignment

make clang-analyzer on x86_64 defconfig caught my attention with:

  kernel/taskstats.c:120:2: warning: Value stored to 'rc' is never read \
  [clang-analyzer-deadcode.DeadStores]
          rc = 0;
          ^

Commit d94a041519f3 ("taskstats: free skb, avoid returns in
send_cpu_listeners") made send_cpu_listeners() not return a value and
hence, the rc variable remained only to be used within the loop where
it is always assigned before read and it does not need any other
initialisation.

So, simply remove this unneeded dead initializing assignment.

As compilers will detect this unneeded assignment and optimize this anyway,
the resulting object code is identical before and after this change.

No functional change. No change to object code.

[akpm@linux-foundation.org: reduce scope of `rc']

Link: https://lkml.kernel.org/r/20220307093942.21310-1-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Tom Rix <trix@redhat.com>
Cc: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/taskstats.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2b4898b4752e..bcac5a9043aa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -113,13 +113,14 @@ static void send_cpu_listeners(struct sk_buff *skb,
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, delcount = 0;
+	int delcount = 0;
 
 	genlmsg_end(skb, reply);
 
-	rc = 0;
 	down_read(&listeners->sem);
 	list_for_each_entry(s, &listeners->list, list) {
+		int rc;
+
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
 			skb_next = skb_clone(skb_cur, GFP_KERNEL);
-- 
cgit v1.2.3


From 8d470a45d1a65e6a308aeee5da7f5b37d3303c04 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@igalia.com>
Date: Wed, 23 Mar 2022 16:07:06 -0700
Subject: panic: add option to dump all CPUs backtraces in panic_print

Currently the "panic_print" parameter/sysctl allows some interesting debug
information to be printed during a panic event.  This is useful for
example in cases the user cannot kdump due to resource limits, or if the
user collects panic logs in a serial output (or pstore) and prefers a fast
reboot instead of a kdump.

Happens that currently there's no way to see all CPUs backtraces in a
panic using "panic_print" on architectures that support that.  We do have
"oops_all_cpu_backtrace" sysctl, but although partially overlapping in the
functionality, they are orthogonal in nature: "panic_print" is a panic
tuning (and we have panics without oopses, like direct calls to panic() or
maybe other paths that don't go through oops_enter() function), and the
original purpose of "oops_all_cpu_backtrace" is to provide more
information on oopses for cases in which the users desire to continue
running the kernel even after an oops, i.e., used in non-panic scenarios.

So, we hereby introduce an additional bit for "panic_print" to allow
dumping the CPUs backtraces during a panic event.

Link: https://lkml.kernel.org/r/20211109202848.610874-3-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Reviewed-by: Feng Tang <feng.tang@intel.com>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 95ba825522dd..3c3fb36d8d41 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
 #define PANIC_PRINT_LOCK_INFO		0x00000008
 #define PANIC_PRINT_FTRACE_INFO		0x00000010
 #define PANIC_PRINT_ALL_PRINTK_MSG	0x00000020
+#define PANIC_PRINT_ALL_CPU_BT		0x00000040
 unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -152,6 +153,9 @@ static void panic_print_sys_info(void)
 	if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
 		console_flush_on_panic(CONSOLE_REPLAY_ALL);
 
+	if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+		trigger_all_cpu_backtrace();
+
 	if (panic_print & PANIC_PRINT_TASK_INFO)
 		show_state();
 
-- 
cgit v1.2.3


From f953f140f318a641c443b0b8c618155ed90a7a10 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@igalia.com>
Date: Wed, 23 Mar 2022 16:07:09 -0700
Subject: panic: move panic_print before kmsg dumpers

The panic_print setting allows users to collect more information in a
panic event, like memory stats, tasks, CPUs backtraces, etc.  This is an
interesting debug mechanism, but currently the print event happens *after*
kmsg_dump(), meaning that pstore, for example, cannot collect a dmesg with
the panic_print extra information.

This patch changes that in 2 steps:

(a) The panic_print setting allows to replay the existing kernel log
    buffer to the console (bit 5), besides the extra information dump.
    This functionality makes sense only at the end of the panic()
    function.  So, we hereby allow to distinguish the two situations by a
    new boolean parameter in the function panic_print_sys_info().

(b) With the above change, we can safely call panic_print_sys_info()
    before kmsg_dump(), allowing to dump the extra information when using
    pstore or other kmsg dumpers.

The additional messages from panic_print could overwrite the oldest
messages when the buffer is full.  The only reasonable solution is to use
a large enough log buffer, hence we added an advice into the kernel
parameters documentation about that.

Link: https://lkml.kernel.org/r/20220214141308.841525-1-gpiccoli@igalia.com
Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Feng Tang <feng.tang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 3c3fb36d8d41..eb4dfb932c85 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -148,10 +148,13 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
 }
 EXPORT_SYMBOL(nmi_panic);
 
-static void panic_print_sys_info(void)
+static void panic_print_sys_info(bool console_flush)
 {
-	if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
-		console_flush_on_panic(CONSOLE_REPLAY_ALL);
+	if (console_flush) {
+		if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+			console_flush_on_panic(CONSOLE_REPLAY_ALL);
+		return;
+	}
 
 	if (panic_print & PANIC_PRINT_ALL_CPU_BT)
 		trigger_all_cpu_backtrace();
@@ -286,6 +289,8 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
+	panic_print_sys_info(false);
+
 	kmsg_dump(KMSG_DUMP_PANIC);
 
 	/*
@@ -316,7 +321,7 @@ void panic(const char *fmt, ...)
 	debug_locks_off();
 	console_flush_on_panic(CONSOLE_FLUSH_PENDING);
 
-	panic_print_sys_info();
+	panic_print_sys_info(true);
 
 	if (!panic_blink)
 		panic_blink = no_blink;
-- 
cgit v1.2.3


From 17581aa13680e1c38759cbb0ed32d04aa8849bea Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh <nogikh@google.com>
Date: Wed, 23 Mar 2022 16:07:12 -0700
Subject: kcov: split ioctl handling into locked and unlocked parts

Patch series "kcov: improve mmap processing", v3.

Subsequent mmaps of the same kcov descriptor currently do not update the
virtual memory of the task and yet return 0 (success).  This is
counter-intuitive and may lead to unexpected memory access errors.

Also, this unnecessarily limits the functionality of kcov to only the
simplest usage scenarios.  Kcov instances are effectively forever attached
to their first address spaces and it becomes impossible to e.g.  reuse the
same kcov handle in forked child processes without mmapping the memory
first.  This is exactly what we tried to do in syzkaller and inadvertently
came upon this behavior.

This patch series addresses the problem described above.

This patch (of 3):

Currently all ioctls are de facto processed under a spinlock in order to
serialise them.  This, however, prohibits the use of vmalloc and other
memory management functions in the implementations of those ioctls,
unnecessary complicating any further changes to the code.

Let all ioctls first be processed inside the kcov_ioctl() function which
should execute the ones that are not compatible with spinlock and then
pass control to kcov_ioctl_locked() for all other ones.
KCOV_REMOTE_ENABLE is processed both in kcov_ioctl() and
kcov_ioctl_locked() as the steps are easily separable.

Although it is still compatible with a spinlock, move KCOV_INIT_TRACE
handling to kcov_ioctl(), so that the changes from the next commit are
easier to follow.

Link: https://lkml.kernel.org/r/20220117153634.150357-1-nogikh@google.com
Link: https://lkml.kernel.org/r/20220117153634.150357-2-nogikh@google.com
Signed-off-by: Aleksandr Nogikh <nogikh@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Taras Madan <tarasmadan@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 68 ++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 37 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e..e1be7301500b 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -564,31 +564,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 			     unsigned long arg)
 {
 	struct task_struct *t;
-	unsigned long size, unused;
+	unsigned long flags, unused;
 	int mode, i;
 	struct kcov_remote_arg *remote_arg;
 	struct kcov_remote *remote;
-	unsigned long flags;
 
 	switch (cmd) {
-	case KCOV_INIT_TRACE:
-		/*
-		 * Enable kcov in trace mode and setup buffer size.
-		 * Must happen before anything else.
-		 */
-		if (kcov->mode != KCOV_MODE_DISABLED)
-			return -EBUSY;
-		/*
-		 * Size must be at least 2 to hold current position and one PC.
-		 * Later we allocate size * sizeof(unsigned long) memory,
-		 * that must not overflow.
-		 */
-		size = arg;
-		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
-			return -EINVAL;
-		kcov->size = size;
-		kcov->mode = KCOV_MODE_INIT;
-		return 0;
 	case KCOV_ENABLE:
 		/*
 		 * Enable coverage for the current task.
@@ -692,9 +673,32 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	struct kcov_remote_arg *remote_arg = NULL;
 	unsigned int remote_num_handles;
 	unsigned long remote_arg_size;
-	unsigned long flags;
+	unsigned long size, flags;
 
-	if (cmd == KCOV_REMOTE_ENABLE) {
+	kcov = filep->private_data;
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 *
+		 * First check the size argument - it must be at least 2
+		 * to hold the current position and one PC. Later we allocate
+		 * size * sizeof(unsigned long) memory, that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		spin_lock_irqsave(&kcov->lock, flags);
+		if (kcov->mode != KCOV_MODE_DISABLED) {
+			spin_unlock_irqrestore(&kcov->lock, flags);
+			return -EBUSY;
+		}
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_INIT;
+		spin_unlock_irqrestore(&kcov->lock, flags);
+		return 0;
+	case KCOV_REMOTE_ENABLE:
 		if (get_user(remote_num_handles, (unsigned __user *)(arg +
 				offsetof(struct kcov_remote_arg, num_handles))))
 			return -EFAULT;
@@ -710,16 +714,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 		}
 		arg = (unsigned long)remote_arg;
+		fallthrough;
+	default:
+		/*
+		 * All other commands can be normally executed under a spin lock, so we
+		 * obtain and release it here in order to simplify kcov_ioctl_locked().
+		 */
+		spin_lock_irqsave(&kcov->lock, flags);
+		res = kcov_ioctl_locked(kcov, cmd, arg);
+		spin_unlock_irqrestore(&kcov->lock, flags);
+		kfree(remote_arg);
+		return res;
 	}
-
-	kcov = filep->private_data;
-	spin_lock_irqsave(&kcov->lock, flags);
-	res = kcov_ioctl_locked(kcov, cmd, arg);
-	spin_unlock_irqrestore(&kcov->lock, flags);
-
-	kfree(remote_arg);
-
-	return res;
 }
 
 static const struct file_operations kcov_fops = {
-- 
cgit v1.2.3


From b3d7fe86fbd06638c71dd851ba921adf50d912ce Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh <nogikh@google.com>
Date: Wed, 23 Mar 2022 16:07:15 -0700
Subject: kcov: properly handle subsequent mmap calls

Allocate the kcov buffer during KCOV_MODE_INIT in order to untie mmapping
of a kcov instance and the actual coverage collection process. Modify
kcov_mmap, so that it can be reliably used any number of times once
KCOV_MODE_INIT has succeeded.

These changes to the user-facing interface of the tool only weaken the
preconditions, so all existing user space code should remain compatible
with the new version.

Link: https://lkml.kernel.org/r/20220117153634.150357-3-nogikh@google.com
Signed-off-by: Aleksandr Nogikh <nogikh@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Taras Madan <tarasmadan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index e1be7301500b..475524bd900a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t)
 static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 {
 	int res = 0;
-	void *area;
 	struct kcov *kcov = vma->vm_file->private_data;
 	unsigned long size, off;
 	struct page *page;
 	unsigned long flags;
 
-	area = vmalloc_user(vma->vm_end - vma->vm_start);
-	if (!area)
-		return -ENOMEM;
-
 	spin_lock_irqsave(&kcov->lock, flags);
 	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+	if (kcov->area == NULL || vma->vm_pgoff != 0 ||
 	    vma->vm_end - vma->vm_start != size) {
 		res = -EINVAL;
 		goto exit;
 	}
-	if (!kcov->area) {
-		kcov->area = area;
-		vma->vm_flags |= VM_DONTEXPAND;
-		spin_unlock_irqrestore(&kcov->lock, flags);
-		for (off = 0; off < size; off += PAGE_SIZE) {
-			page = vmalloc_to_page(kcov->area + off);
-			if (vm_insert_page(vma, vma->vm_start + off, page))
-				WARN_ONCE(1, "vm_insert_page() failed");
-		}
-		return 0;
+	spin_unlock_irqrestore(&kcov->lock, flags);
+	vma->vm_flags |= VM_DONTEXPAND;
+	for (off = 0; off < size; off += PAGE_SIZE) {
+		page = vmalloc_to_page(kcov->area + off);
+		if (vm_insert_page(vma, vma->vm_start + off, page))
+			WARN_ONCE(1, "vm_insert_page() failed");
 	}
+	return 0;
 exit:
 	spin_unlock_irqrestore(&kcov->lock, flags);
-	vfree(area);
 	return res;
 }
 
@@ -674,6 +665,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	unsigned int remote_num_handles;
 	unsigned long remote_arg_size;
 	unsigned long size, flags;
+	void *area;
 
 	kcov = filep->private_data;
 	switch (cmd) {
@@ -683,17 +675,21 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 		 * Must happen before anything else.
 		 *
 		 * First check the size argument - it must be at least 2
-		 * to hold the current position and one PC. Later we allocate
-		 * size * sizeof(unsigned long) memory, that must not overflow.
+		 * to hold the current position and one PC.
 		 */
 		size = arg;
 		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
 			return -EINVAL;
+		area = vmalloc_user(size * sizeof(unsigned long));
+		if (area == NULL)
+			return -ENOMEM;
 		spin_lock_irqsave(&kcov->lock, flags);
 		if (kcov->mode != KCOV_MODE_DISABLED) {
 			spin_unlock_irqrestore(&kcov->lock, flags);
+			vfree(area);
 			return -EBUSY;
 		}
+		kcov->area = area;
 		kcov->size = size;
 		kcov->mode = KCOV_MODE_INIT;
 		spin_unlock_irqrestore(&kcov->lock, flags);
-- 
cgit v1.2.3


From 0cbcc92917c5de80f15c24d033566539ad696892 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 23 Mar 2022 16:07:18 -0700
Subject: kernel/resource: fix kfree() of bootmem memory again

Since commit ebff7d8f270d ("mem hotunplug: fix kfree() of bootmem
memory"), we could get a resource allocated during boot via
alloc_resource().  And it's required to release the resource using
free_resource().  Howerver, many people use kfree directly which will
result in kernel BUG.  In order to fix this without fixing every call
site, just leak a couple of bytes in such corner case.

Link: https://lkml.kernel.org/r/20220217083619.19305-1-linmiaohe@huawei.com
Fixes: ebff7d8f270d ("mem hotunplug: fix kfree() of bootmem memory")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 9c08d6e9eef2..34eaee179689 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,14 +56,6 @@ struct resource_constraint {
 
 static DEFINE_RWLOCK(resource_lock);
 
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
 static struct resource *next_resource(struct resource *p)
 {
 	if (p->child)
@@ -160,36 +152,19 @@ __initcall(ioresources_init);
 
 static void free_resource(struct resource *res)
 {
-	if (!res)
-		return;
-
-	if (!PageSlab(virt_to_head_page(res))) {
-		spin_lock(&bootmem_resource_lock);
-		res->sibling = bootmem_resource_free;
-		bootmem_resource_free = res;
-		spin_unlock(&bootmem_resource_lock);
-	} else {
+	/**
+	 * If the resource was allocated using memblock early during boot
+	 * we'll leak it here: we can only return full pages back to the
+	 * buddy and trying to be smart and reusing them eventually in
+	 * alloc_resource() overcomplicates resource handling.
+	 */
+	if (res && PageSlab(virt_to_head_page(res)))
 		kfree(res);
-	}
 }
 
 static struct resource *alloc_resource(gfp_t flags)
 {
-	struct resource *res = NULL;
-
-	spin_lock(&bootmem_resource_lock);
-	if (bootmem_resource_free) {
-		res = bootmem_resource_free;
-		bootmem_resource_free = res->sibling;
-	}
-	spin_unlock(&bootmem_resource_lock);
-
-	if (res)
-		memset(res, 0, sizeof(struct resource));
-	else
-		res = kzalloc(sizeof(struct resource), flags);
-
-	return res;
+	return kzalloc(sizeof(struct resource), flags);
 }
 
 /* Return the conflict entry if you can't request it */
-- 
cgit v1.2.3


From c1cb81429df462eca1b6ba615cddd21dd3103c46 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Fri, 28 Jan 2022 14:40:55 +0000
Subject: kdb: Fix the putarea helper function

Currently kdb_putarea_size() uses copy_from_kernel_nofault() to write *to*
arbitrary kernel memory. This is obviously wrong and means the memory
modify ('mm') command is a serious risk to debugger stability: if we poke
to a bad address we'll double-fault and lose our debug session.

Fix this the (very) obvious way.

Note that there are two Fixes: tags because the API was renamed and this
patch will only trivially backport as far as the rename (and this is
probably enough). Nevertheless Christoph's rename did not introduce this
problem so I wanted to record that!

Fixes: fe557319aa06 ("maccess: rename probe_kernel_{read,write} to copy_{from,to}_kernel_nofault")
Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)")
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20220128144055.207267-1-daniel.thompson@linaro.org
---
 kernel/debug/kdb/kdb_support.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index df2bface866e..85cb51c4a17e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -291,7 +291,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
  */
 int kdb_putarea_size(unsigned long addr, void *res, size_t size)
 {
-	int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+	int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
 	if (ret) {
 		if (!KDB_STATE(SUPPRESS)) {
 			kdb_func_printf("Bad address 0x%lx\n", addr);
-- 
cgit v1.2.3


From c08e6a1206e6876c66e0528b3ec717f557b07dd4 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 24 Mar 2022 18:11:07 -0700
Subject: kasan, fork: reset pointer tags of vmapped stacks

Once tag-based KASAN modes start tagging vmalloc() allocations, kernel
stacks start getting tagged if CONFIG_VMAP_STACK is enabled.

Reset the tag of kernel stack pointers after allocation in
alloc_thread_stack_node().

For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation
can't handle the SP register being tagged.

For HW_TAGS KASAN, there's no instrumentation-related issues.  However,
the impact of having a tagged SP register needs to be properly evaluated,
so keep it non-tagged for now.

Note, that the memory for the stack allocation still gets tagged to catch
vmalloc-into-stack out-of-bounds accesses.

Link: https://lkml.kernel.org/r/c6c96f012371ecd80e1936509ebcd3b07a5956f7.1643047180.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 834af51ed397..ce0ffe48937f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -326,6 +326,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	 * so cache the vm_struct.
 	 */
 	tsk->stack_vm_area = vm;
+	if (stack)
+		stack = kasan_reset_tag(stack);
 	tsk->stack = stack;
 	return 0;
 }
-- 
cgit v1.2.3


From 51fb34de2a4c8fa0f221246313700bfe3b6c586d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 24 Mar 2022 18:11:10 -0700
Subject: kasan, arm64: reset pointer tags of vmapped stacks

Once tag-based KASAN modes start tagging vmalloc() allocations, kernel
stacks start getting tagged if CONFIG_VMAP_STACK is enabled.

Reset the tag of kernel stack pointers after allocation in
arch_alloc_vmap_stack().

For SW_TAGS KASAN, when CONFIG_KASAN_STACK is enabled, the instrumentation
can't handle the SP register being tagged.

For HW_TAGS KASAN, there's no instrumentation-related issues.  However,
the impact of having a tagged SP register needs to be properly evaluated,
so keep it non-tagged for now.

Note, that the memory for the stack allocation still gets tagged to catch
vmalloc-into-stack out-of-bounds accesses.

[andreyknvl@google.com: fix case when a stack is retrieved from cached_stacks]
  Link: https://lkml.kernel.org/r/f50c5f96ef896d7936192c888b0c0a7674e33184.1644943792.git.andreyknvl@google.com
[dan.carpenter@oracle.com: remove unnecessary check in alloc_thread_stack_node()]
  Link: https://lkml.kernel.org/r/20220301080706.GB17208@kili

Link: https://lkml.kernel.org/r/698c5ab21743c796d46c15d075b9481825973e34.1643047180.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Marco Elver <elver@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index ce0ffe48937f..9796897560ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -286,11 +286,13 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
-		/* Mark stack accessible for KASAN. */
+		/* Reset stack metadata. */
 		kasan_unpoison_range(s->addr, THREAD_SIZE);
 
+		stack = kasan_reset_tag(s->addr);
+
 		/* Clear stale pointers from reused stack. */
-		memset(s->addr, 0, THREAD_SIZE);
+		memset(stack, 0, THREAD_SIZE);
 
 		if (memcg_charge_kernel_stack(s)) {
 			vfree(s->addr);
@@ -298,7 +300,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 		}
 
 		tsk->stack_vm_area = s;
-		tsk->stack = s->addr;
+		tsk->stack = stack;
 		return 0;
 	}
 
@@ -326,8 +328,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 	 * so cache the vm_struct.
 	 */
 	tsk->stack_vm_area = vm;
-	if (stack)
-		stack = kasan_reset_tag(stack);
+	stack = kasan_reset_tag(stack);
 	tsk->stack = stack;
 	return 0;
 }
-- 
cgit v1.2.3


From 23689e91fb22c15b84ac6c22ad9942039792f3af Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 24 Mar 2022 18:11:32 -0700
Subject: kasan, vmalloc: add vmalloc tagging for HW_TAGS

Add vmalloc tagging support to HW_TAGS KASAN.

The key difference between HW_TAGS and the other two KASAN modes when it
comes to vmalloc: HW_TAGS KASAN can only assign tags to physical memory.
The other two modes have shadow memory covering every mapped virtual
memory region.

Make __kasan_unpoison_vmalloc() for HW_TAGS KASAN:

 - Skip non-VM_ALLOC mappings as HW_TAGS KASAN can only tag a single
   mapping of normal physical memory; see the comment in the function.

 - Generate a random tag, tag the returned pointer and the allocation,
   and initialize the allocation at the same time.

 - Propagate the tag into the page stucts to allow accesses through
   page_address(vmalloc_to_page()).

The rest of vmalloc-related KASAN hooks are not needed:

 - The shadow-related ones are fully skipped.

 - __kasan_poison_vmalloc() is kept as a no-op with a comment.

Poisoning and zeroing of physical pages that are backing vmalloc()
allocations are skipped via __GFP_SKIP_KASAN_UNPOISON and
__GFP_SKIP_ZERO: __kasan_unpoison_vmalloc() does that instead.

Enabling CONFIG_KASAN_VMALLOC with HW_TAGS is not yet allowed.

Link: https://lkml.kernel.org/r/d19b2e9e59a9abc59d05b72dea8429dcaea739c6.1643047180.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Co-developed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/scs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/scs.c b/kernel/scs.c
index 579841be8864..b83bc9251f99 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -32,7 +32,7 @@ static void *__scs_alloc(int node)
 	for (i = 0; i < NR_CACHED_SCS; i++) {
 		s = this_cpu_xchg(scs_cache[i], NULL);
 		if (s) {
-			kasan_unpoison_vmalloc(s, SCS_SIZE);
+			kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE);
 			memset(s, 0, SCS_SIZE);
 			return s;
 		}
@@ -78,7 +78,7 @@ void scs_free(void *s)
 		if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
 			return;
 
-	kasan_unpoison_vmalloc(s, SCS_SIZE);
+	kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE);
 	vfree_atomic(s);
 }
 
-- 
cgit v1.2.3


From f6e39794f4b6da7ca9b77f2f9ad11fd6f0ac83e5 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 24 Mar 2022 18:11:35 -0700
Subject: kasan, vmalloc: only tag normal vmalloc allocations

The kernel can use to allocate executable memory.  The only supported
way to do that is via __vmalloc_node_range() with the executable bit set
in the prot argument.  (vmap() resets the bit via pgprot_nx()).

Once tag-based KASAN modes start tagging vmalloc allocations, executing
code from such allocations will lead to the PC register getting a tag,
which is not tolerated by the kernel.

Only tag the allocations for normal kernel pages.

[andreyknvl@google.com: pass KASAN_VMALLOC_PROT_NORMAL to kasan_unpoison_vmalloc()]
  Link: https://lkml.kernel.org/r/9230ca3d3e40ffca041c133a524191fd71969a8d.1646233925.git.andreyknvl@google.com
[andreyknvl@google.com: support tagged vmalloc mappings]
  Link: https://lkml.kernel.org/r/2f6605e3a358cf64d73a05710cb3da356886ad29.1646233925.git.andreyknvl@google.com
[andreyknvl@google.com: don't unintentionally disabled poisoning]
  Link: https://lkml.kernel.org/r/de4587d6a719232e83c760113e46ed2d4d8da61e.1646757322.git.andreyknvl@google.com

Link: https://lkml.kernel.org/r/fbfd9939a4dc375923c9a5c6b9e7ab05c26b8c6b.1643047180.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/scs.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/scs.c b/kernel/scs.c
index b83bc9251f99..b7e1b096d906 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -32,15 +32,19 @@ static void *__scs_alloc(int node)
 	for (i = 0; i < NR_CACHED_SCS; i++) {
 		s = this_cpu_xchg(scs_cache[i], NULL);
 		if (s) {
-			kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE);
+			s = kasan_unpoison_vmalloc(s, SCS_SIZE,
+						   KASAN_VMALLOC_PROT_NORMAL);
 			memset(s, 0, SCS_SIZE);
-			return s;
+			goto out;
 		}
 	}
 
-	return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+	s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
 				    GFP_SCS, PAGE_KERNEL, 0, node,
 				    __builtin_return_address(0));
+
+out:
+	return kasan_reset_tag(s);
 }
 
 void *scs_alloc(int node)
@@ -78,7 +82,7 @@ void scs_free(void *s)
 		if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
 			return;
 
-	kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE);
+	kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL);
 	vfree_atomic(s);
 }
 
-- 
cgit v1.2.3


From bddac7c1e02ba47f0570e494c9289acea3062cc1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 26 Mar 2022 10:42:04 -0700
Subject: Revert "swiotlb: rework "fix info leak with DMA_FROM_DEVICE""
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13.

It turns out this breaks at least the ath9k wireless driver, and
possibly others.

What the ath9k driver does on packet receive is to set up the DMA
transfer with:

  int ath_rx_init(..)
  ..
                bf->bf_buf_addr = dma_map_single(sc->dev, skb->data,
                                                 common->rx_bufsize,
                                                 DMA_FROM_DEVICE);

and then the receive logic (through ath_rx_tasklet()) will fetch
incoming packets

  static bool ath_edma_get_buffers(..)
  ..
        dma_sync_single_for_cpu(sc->dev, bf->bf_buf_addr,
                                common->rx_bufsize, DMA_FROM_DEVICE);

        ret = ath9k_hw_process_rxdesc_edma(ah, rs, skb->data);
        if (ret == -EINPROGRESS) {
                /*let device gain the buffer again*/
                dma_sync_single_for_device(sc->dev, bf->bf_buf_addr,
                                common->rx_bufsize, DMA_FROM_DEVICE);
                return false;
        }

and it's worth noting how that first DMA sync:

    dma_sync_single_for_cpu(..DMA_FROM_DEVICE);

is there to make sure the CPU can read the DMA buffer (possibly by
copying it from the bounce buffer area, or by doing some cache flush).
The iommu correctly turns that into a "copy from bounce bufer" so that
the driver can look at the state of the packets.

In the meantime, the device may continue to write to the DMA buffer, but
we at least have a snapshot of the state due to that first DMA sync.

But that _second_ DMA sync:

    dma_sync_single_for_device(..DMA_FROM_DEVICE);

is telling the DMA mapping that the CPU wasn't interested in the area
because the packet wasn't there.  In the case of a DMA bounce buffer,
that is a no-op.

Note how it's not a sync for the CPU (the "for_device()" part), and it's
not a sync for data written by the CPU (the "DMA_FROM_DEVICE" part).

Or rather, it _should_ be a no-op.  That's what commit aa6f8dcbab47
broke: it made the code bounce the buffer unconditionally, and changed
the DMA_FROM_DEVICE to just unconditionally and illogically be
DMA_TO_DEVICE.

[ Side note: purely within the confines of the swiotlb driver it wasn't
  entirely illogical: The reason it did that odd DMA_FROM_DEVICE ->
  DMA_TO_DEVICE conversion thing is because inside the swiotlb driver,
  it uses just a swiotlb_bounce() helper that doesn't care about the
  whole distinction of who the sync is for - only which direction to
  bounce.

  So it took the "sync for device" to mean that the CPU must have been
  the one writing, and thought it meant DMA_TO_DEVICE. ]

Also note how the commentary in that commit was wrong, probably due to
that whole confusion, claiming that the commit makes the swiotlb code

                                  "bounce unconditionally (that is, also
    when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale
    data from the swiotlb buffer"

which is nonsensical for two reasons:

 - that "also when dir == DMA_TO_DEVICE" is nonsensical, as that was
   exactly when it always did - and should do - the bounce.

 - since this is a sync for the device (not for the CPU), we're clearly
   fundamentally not coping back stale data from the bounce buffers at
   all, because we'd be copying *to* the bounce buffers.

So that commit was just very confused.  It confused the direction of the
synchronization (to the device, not the cpu) with the direction of the
DMA (from the device).

Reported-and-bisected-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Reported-by: Olha Cherevyk <olha.cherevyk@gmail.com>
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Toke Høiland-Jørgensen <toke@toke.dk>
Cc: Maxime Bizon <mbizon@freebox.fr>
Cc: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma/swiotlb.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6db1c475ec82..bfc56cb21705 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -627,14 +627,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	for (i = 0; i < nr_slots(alloc_size + offset); i++)
 		mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
 	tlb_addr = slot_addr(mem->start, index) + offset;
-	/*
-	 * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
-	 * to the tlb buffer, if we knew for sure the device will
-	 * overwirte the entire current content. But we don't. Thus
-	 * unconditional bounce may prevent leaking swiotlb content (i.e.
-	 * kernel memory) to user-space.
-	 */
-	swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+	    (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE ||
+	    dir == DMA_BIDIRECTIONAL))
+		swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
 	return tlb_addr;
 }
 
@@ -701,13 +697,10 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
 void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
 		size_t size, enum dma_data_direction dir)
 {
-	/*
-	 * Unconditional bounce is necessary to avoid corruption on
-	 * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite
-	 * the whole lengt of the bounce buffer.
-	 */
-	swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
-	BUG_ON(!valid_dma_direction(dir));
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+		swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+	else
+		BUG_ON(dir != DMA_FROM_DEVICE);
 }
 
 void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
-- 
cgit v1.2.3


From 901c7280ca0d5e2b4a8929fbe0bfb007ac2a6544 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 28 Mar 2022 11:37:05 -0700
Subject: Reinstate some of "swiotlb: rework "fix info leak with
 DMA_FROM_DEVICE""
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Halil Pasic points out [1] that the full revert of that commit (revert
in bddac7c1e02b), and that a partial revert that only reverts the
problematic case, but still keeps some of the cleanups is probably
better.  ￼

And that partial revert [2] had already been verified by Oleksandr
Natalenko to also fix the issue, I had just missed that in the long
discussion.

So let's reinstate the cleanups from commit aa6f8dcbab47 ("swiotlb:
rework "fix info leak with DMA_FROM_DEVICE""), and effectively only
revert the part that caused problems.

Link: https://lore.kernel.org/all/20220328013731.017ae3e3.pasic@linux.ibm.com/ [1]
Link: https://lore.kernel.org/all/20220324055732.GB12078@lst.de/ [2]
Link: https://lore.kernel.org/all/4386660.LvFx2qVVIh@natalenko.name/ [3]
Suggested-by: Halil Pasic <pasic@linux.ibm.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Cc: Christoph Hellwig" <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma/swiotlb.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index bfc56cb21705..6c350555e5a1 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -627,10 +627,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	for (i = 0; i < nr_slots(alloc_size + offset); i++)
 		mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
 	tlb_addr = slot_addr(mem->start, index) + offset;
-	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-	    (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE ||
-	    dir == DMA_BIDIRECTIONAL))
-		swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+	/*
+	 * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
+	 * to the tlb buffer, if we knew for sure the device will
+	 * overwirte the entire current content. But we don't. Thus
+	 * unconditional bounce may prevent leaking swiotlb content (i.e.
+	 * kernel memory) to user-space.
+	 */
+	swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
 	return tlb_addr;
 }
 
-- 
cgit v1.2.3


From 9052e4e83762c40d89f6650fec161ad9001e93f3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 23 Mar 2022 16:35:26 +0900
Subject: fprobe: Fix smatch type mismatch warning

Fix the type mismatching warning of 'rethook_node vs fprobe_rethook_node'
found by Smatch.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164802092611.1732982.12268174743437084619.stgit@devnote2
---
 kernel/trace/fprobe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 8b2dd5b9dcd1..63b2321b22a0 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -150,15 +150,15 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 
 	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
 	for (i = 0; i < size; i++) {
-		struct rethook_node *node;
+		struct fprobe_rethook_node *node;
 
-		node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL);
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
 		if (!node) {
 			rethook_free(fp->rethook);
 			fp->rethook = NULL;
 			return -ENOMEM;
 		}
-		rethook_add_node(fp->rethook, node);
+		rethook_add_node(fp->rethook, &node->node);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 261608f3105ce65e9fd01919f72ead74dcb9f14d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 23 Mar 2022 16:35:36 +0900
Subject: fprobe: Fix sparse warning for acccessing __rcu ftrace_hash

Since ftrace_ops::local_hash::filter_hash field is an __rcu pointer,
we have to use rcu_access_pointer() to access it.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164802093635.1732982.4938094876018890866.stgit@devnote2
---
 kernel/trace/fprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 63b2321b22a0..89d9f994ebb0 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -215,7 +215,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
 	 * correctly calculate the total number of filtered symbols
 	 * from both filter and notfilter.
 	 */
-	hash = fp->ops.local_hash.filter_hash;
+	hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
 	if (WARN_ON_ONCE(!hash))
 		goto out;
 
-- 
cgit v1.2.3


From c29a4920dfcaa1433b09e2674f605f72767a385c Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Fri, 25 Mar 2022 00:42:38 +0800
Subject: bpf: Fix maximum permitted number of arguments check

Since the m->arg_size array can hold up to MAX_BPF_FUNC_ARGS argument
sizes, it's ok that nargs is equal to MAX_BPF_FUNC_ARGS.

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220324164238.1274915-1-ytcoode@gmail.com
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 24788ce564a0..0918a39279f6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5507,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 	}
 	args = (const struct btf_param *)(func + 1);
 	nargs = btf_type_vlen(func);
-	if (nargs >= MAX_BPF_FUNC_ARGS) {
+	if (nargs > MAX_BPF_FUNC_ARGS) {
 		bpf_log(log,
 			"The function %s has %d arguments. Too many.\n",
 			tname, nargs);
-- 
cgit v1.2.3


From 73f9b911faa74ac5107879de05c9489c419f41bb Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Sat, 26 Mar 2022 11:27:05 +0900
Subject: kprobes: Use rethook for kretprobe if possible

Use rethook for kretprobe function return hooking if the arch sets
CONFIG_HAVE_RETHOOK=y. In this case, CONFIG_KRETPROBE_ON_RETHOOK is
set to 'y' automatically, and the kretprobe internal data fields
switches to use rethook. If not, it continues to use kretprobe
specific function return hooks.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164826162556.2455864.12255833167233452047.stgit@devnote2
---
 kernel/Makefile             |   1 +
 kernel/kprobes.c            | 124 +++++++++++++++++++++++++++++++++++++-------
 kernel/trace/trace_kprobe.c |   4 +-
 3 files changed, 107 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 56f4ee97f328..471d71935e90 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_TRACE_CLOCK) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 185badc780b7..dbe57df2e199 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1237,6 +1237,27 @@ void kprobes_inc_nmissed_count(struct kprobe *p)
 }
 NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
 
+static struct kprobe kprobe_busy = {
+	.addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+	struct kprobe_ctlblk *kcb;
+
+	preempt_disable();
+	__this_cpu_write(current_kprobe, &kprobe_busy);
+	kcb = get_kprobe_ctlblk();
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+	__this_cpu_write(current_kprobe, NULL);
+	preempt_enable();
+}
+
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
 static void free_rp_inst_rcu(struct rcu_head *head)
 {
 	struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
@@ -1258,26 +1279,6 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
 }
 NOKPROBE_SYMBOL(recycle_rp_inst);
 
-static struct kprobe kprobe_busy = {
-	.addr = (void *) get_kprobe,
-};
-
-void kprobe_busy_begin(void)
-{
-	struct kprobe_ctlblk *kcb;
-
-	preempt_disable();
-	__this_cpu_write(current_kprobe, &kprobe_busy);
-	kcb = get_kprobe_ctlblk();
-	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-}
-
-void kprobe_busy_end(void)
-{
-	__this_cpu_write(current_kprobe, NULL);
-	preempt_enable();
-}
-
 /*
  * This function is called from delayed_put_task_struct() when a task is
  * dead and cleaned up to recycle any kretprobe instances associated with
@@ -1327,6 +1328,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
 		rp->rph = NULL;
 	}
 }
+#endif	/* !CONFIG_KRETPROBE_ON_RETHOOK */
 
 /* Add the new probe to 'ap->list'. */
 static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
@@ -1925,6 +1927,7 @@ static struct notifier_block kprobe_exceptions_nb = {
 
 #ifdef CONFIG_KRETPROBES
 
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
 /* This assumes the 'tsk' is the current task or the is not running. */
 static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
 						  struct llist_node **cur)
@@ -2087,6 +2090,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	struct kretprobe_instance *ri;
+	struct rethook_node *rhn;
+
+	rhn = rethook_try_get(rp->rh);
+	if (!rhn) {
+		rp->nmissed++;
+		return 0;
+	}
+
+	ri = container_of(rhn, struct kretprobe_instance, node);
+
+	if (rp->entry_handler && rp->entry_handler(ri, regs))
+		rethook_recycle(rhn);
+	else
+		rethook_hook(rhn, regs, kprobe_ftrace(p));
+
+	return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
+
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+				      struct pt_regs *regs)
+{
+	struct kretprobe *rp = (struct kretprobe *)data;
+	struct kretprobe_instance *ri;
+	struct kprobe_ctlblk *kcb;
+
+	/* The data must NOT be null. This means rethook data structure is broken. */
+	if (WARN_ON_ONCE(!data))
+		return;
+
+	__this_cpu_write(current_kprobe, &rp->kp);
+	kcb = get_kprobe_ctlblk();
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+	ri = container_of(rh, struct kretprobe_instance, node);
+	rp->handler(ri, regs);
+
+	__this_cpu_write(current_kprobe, NULL);
+}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
 
 /**
  * kprobe_on_func_entry() -- check whether given address is function entry
@@ -2155,6 +2209,29 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->maxactive = num_possible_cpus();
 #endif
 	}
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+	rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
+	if (!rp->rh)
+		return -ENOMEM;
+
+	for (i = 0; i < rp->maxactive; i++) {
+		inst = kzalloc(sizeof(struct kretprobe_instance) +
+			       rp->data_size, GFP_KERNEL);
+		if (inst == NULL) {
+			rethook_free(rp->rh);
+			rp->rh = NULL;
+			return -ENOMEM;
+		}
+		rethook_add_node(rp->rh, &inst->node);
+	}
+	rp->nmissed = 0;
+	/* Establish function entry probe point */
+	ret = register_kprobe(&rp->kp);
+	if (ret != 0) {
+		rethook_free(rp->rh);
+		rp->rh = NULL;
+	}
+#else	/* !CONFIG_KRETPROBE_ON_RETHOOK */
 	rp->freelist.head = NULL;
 	rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
 	if (!rp->rph)
@@ -2179,6 +2256,7 @@ int register_kretprobe(struct kretprobe *rp)
 	ret = register_kprobe(&rp->kp);
 	if (ret != 0)
 		free_rp_inst(rp);
+#endif
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -2217,7 +2295,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
 	for (i = 0; i < num; i++) {
 		if (__unregister_kprobe_top(&rps[i]->kp) < 0)
 			rps[i]->kp.addr = NULL;
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+		rethook_free(rps[i]->rh);
+#else
 		rps[i]->rph->rp = NULL;
+#endif
 	}
 	mutex_unlock(&kprobe_mutex);
 
@@ -2225,7 +2307,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
 	for (i = 0; i < num; i++) {
 		if (rps[i]->kp.addr) {
 			__unregister_kprobe_bottom(&rps[i]->kp);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
 			free_rp_inst(rps[i]);
+#endif
 		}
 	}
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b62fd785b599..47cebef78532 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1433,7 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	fbuffer.regs = regs;
 	entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
 	entry->func = (unsigned long)tk->rp.kp.addr;
-	entry->ret_ip = (unsigned long)ri->ret_addr;
+	entry->ret_ip = get_kretprobe_retaddr(ri);
 	store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
 
 	trace_event_buffer_commit(&fbuffer);
@@ -1628,7 +1628,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		return;
 
 	entry->func = (unsigned long)tk->rp.kp.addr;
-	entry->ret_ip = (unsigned long)ri->ret_addr;
+	entry->ret_ip = get_kretprobe_retaddr(ri);
 	store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL);
-- 
cgit v1.2.3


From d31e0386a2f122b40b605eb0120a2fbcfca77868 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 30 Mar 2022 13:05:10 +0200
Subject: bpf: Fix sparse warnings in kprobe_multi_resolve_syms

Adding missing __user tags to fix sparse warnings:

kernel/trace/bpf_trace.c:2370:34: warning: incorrect type in argument 2 (different address spaces)
kernel/trace/bpf_trace.c:2370:34:    expected void const [noderef] __user *from
kernel/trace/bpf_trace.c:2370:34:    got void const *usyms
kernel/trace/bpf_trace.c:2376:51: warning: incorrect type in argument 2 (different address spaces)
kernel/trace/bpf_trace.c:2376:51:    expected char const [noderef] __user *src
kernel/trace/bpf_trace.c:2376:51:    got char const *
kernel/trace/bpf_trace.c:2443:49: warning: incorrect type in argument 1 (different address spaces)
kernel/trace/bpf_trace.c:2443:49:    expected void const *usyms
kernel/trace/bpf_trace.c:2443:49:    got void [noderef] __user *[assigned] usyms

Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220330110510.398558-1-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7fa2ebc07f60..d8553f46caa2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2349,11 +2349,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
 }
 
 static int
-kprobe_multi_resolve_syms(const void *usyms, u32 cnt,
+kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
 			  unsigned long *addrs)
 {
 	unsigned long addr, size;
-	const char **syms;
+	const char __user **syms;
 	int err = -ENOMEM;
 	unsigned int i;
 	char *func;
-- 
cgit v1.2.3


From a2fb49833cad07a53651c23dce508127085fae2d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 31 Mar 2022 10:11:17 +0900
Subject: rethook: Fix to use WRITE_ONCE() for rethook:: Handler

Since the function pointered by rethook::handler never be removed when
the rethook is alive, it doesn't need to use rcu_assign_pointer() to
update it. Just use WRITE_ONCE().

Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/164868907688.21983.1606862921419988152.stgit@devnote2
---
 kernel/trace/rethook.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index ab463a4d2b23..b56833700d23 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -65,7 +65,7 @@ static void rethook_free_rcu(struct rcu_head *head)
  */
 void rethook_free(struct rethook *rh)
 {
-	rcu_assign_pointer(rh->handler, NULL);
+	WRITE_ONCE(rh->handler, NULL);
 
 	call_rcu(&rh->rcu, rethook_free_rcu);
 }
-- 
cgit v1.2.3


From 7dd5ad2d3e82fb55229e3fe18e09160878e77e20 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 31 Mar 2022 10:36:55 +0200
Subject: Revert "signal, x86: Delay calling signals in atomic on RT enabled
 kernels"

Revert commit bf9ad37dc8a. It needs to be better encapsulated and
generalized.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/Kconfig.preempt | 12 +-----------
 kernel/entry/common.c  | 14 --------------
 kernel/signal.c        | 40 ----------------------------------------
 3 files changed, 1 insertion(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 8c6de5a9ecc4..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -133,14 +133,4 @@ config SCHED_CORE
 	  which is the likely usage by Linux distributions, there should
 	  be no measurable impact on performance.
 
-config ARCH_WANTS_RT_DELAYED_SIGNALS
-	bool
-	help
-	  This option is selected by architectures where raising signals
-	  can happen in atomic contexts on PREEMPT_RT enabled kernels. This
-	  option delays raising the signal until the return to user space
-	  loop where it is also delivered. X86 requires this to deliver
-	  signals from trap handlers which run on IST stacks.
-
-config RT_DELAYED_SIGNALS
-	def_bool PREEMPT_RT && ARCH_WANTS_RT_DELAYED_SIGNALS
+
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index ef8d94a98b7e..e57a224d6b79 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -142,18 +142,6 @@ void noinstr exit_to_user_mode(void)
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
-#ifdef CONFIG_RT_DELAYED_SIGNALS
-static inline void raise_delayed_signal(void)
-{
-	if (unlikely(current->forced_info.si_signo)) {
-		force_sig_info(&current->forced_info);
-		current->forced_info.si_signo = 0;
-	}
-}
-#else
-static inline void raise_delayed_signal(void) { }
-#endif
-
 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 					    unsigned long ti_work)
 {
@@ -168,8 +156,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 		if (ti_work & _TIF_NEED_RESCHED)
 			schedule();
 
-		raise_delayed_signal();
-
 		if (ti_work & _TIF_UPROBE)
 			uprobe_notify_resume(regs);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 368a34c25bbf..30cd1ca43bcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1307,43 +1307,6 @@ enum sig_handler {
 	HANDLER_EXIT,	 /* Only visible as the process exit code */
 };
 
-/*
- * On some archictectures, PREEMPT_RT has to delay sending a signal from a
- * trap since it cannot enable preemption, and the signal code's
- * spin_locks turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME
- * which will send the signal on exit of the trap.
- */
-#ifdef CONFIG_RT_DELAYED_SIGNALS
-static inline bool force_sig_delayed(struct kernel_siginfo *info,
-				     struct task_struct *t)
-{
-	if (!in_atomic())
-		return false;
-
-	if (WARN_ON_ONCE(t->forced_info.si_signo))
-		return true;
-
-	if (is_si_special(info)) {
-		WARN_ON_ONCE(info != SEND_SIG_PRIV);
-		t->forced_info.si_signo = info->si_signo;
-		t->forced_info.si_errno = 0;
-		t->forced_info.si_code = SI_KERNEL;
-		t->forced_info.si_pid = 0;
-		t->forced_info.si_uid = 0;
-	} else {
-		t->forced_info = *info;
-	}
-	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-	return true;
-}
-#else
-static inline bool force_sig_delayed(struct kernel_siginfo *info,
-				     struct task_struct *t)
-{
-	return false;
-}
-#endif
-
 /*
  * Force a signal that the process can't ignore: if necessary
  * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1364,9 +1327,6 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 	struct k_sigaction *action;
 	int sig = info->si_signo;
 
-	if (force_sig_delayed(info, t))
-		return 0;
-
 	spin_lock_irqsave(&t->sighand->siglock, flags);
 	action = &t->sighand->action[sig-1];
 	ignored = action->sa.sa_handler == SIG_IGN;
-- 
cgit v1.2.3


From 4fe87e818ea492ade079cc01a31d088e445f8539 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 31 Mar 2022 08:01:21 +0200
Subject: dma-mapping: move pgprot_decrypted out of dma_pgprot

pgprot_decrypted is used by AMD SME systems to allow access to memory
that was set to not encrypted using set_memory_decrypted.  That only
happens for dma-direct memory as the IOMMU solves the addressing
challenges for the encryption bit using its own remapping.

Move the pgprot_decrypted call out of dma_pgprot which is also used
by the IOMMU mappings and into dma-direct so that it is only used with
memory that was set decrypted.

Fixes: f5ff79fddf0e ("dma-mapping: remove CONFIG_DMA_REMAP")
Reported-by: Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
---
 kernel/dma/direct.c  | 10 ++++++++--
 kernel/dma/mapping.c |  2 --
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 35a1d29d6a2e..9743c6ccce1a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -277,12 +277,16 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	}
 
 	if (remap) {
+		pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+		if (force_dma_unencrypted(dev))
+			prot = pgprot_decrypted(prot);
+
 		/* remove any dirty cache lines on the kernel alias */
 		arch_dma_prep_coherent(page, size);
 
 		/* create a coherent mapping */
-		ret = dma_common_contiguous_remap(page, size,
-				dma_pgprot(dev, PAGE_KERNEL, attrs),
+		ret = dma_common_contiguous_remap(page, size, prot,
 				__builtin_return_address(0));
 		if (!ret)
 			goto out_free_pages;
@@ -535,6 +539,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
 	int ret = -ENXIO;
 
 	vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+	if (force_dma_unencrypted(dev))
+		vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
 	if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
 		return ret;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 559461a826ba..db7244291b74 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -407,8 +407,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
  */
 pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
 {
-	if (force_dma_unencrypted(dev))
-		prot = pgprot_decrypted(prot);
 	if (dev_is_dma_coherent(dev))
 		return prot;
 #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
-- 
cgit v1.2.3


From 11e17ae423778f48c84da6a2e215f140610e1973 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Tue, 22 Mar 2022 14:21:49 +0800
Subject: bpf: Use swap() instead of open coding it

Clean the following coccicheck warning:

./kernel/trace/bpf_trace.c:2263:34-35: WARNING opportunity for swap().
./kernel/trace/bpf_trace.c:2264:40-41: WARNING opportunity for swap().

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220322062149.109180-1-jiapeng.chong@linux.alibaba.com
---
 kernel/trace/bpf_trace.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7fa2ebc07f60..836f021cb609 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2254,15 +2254,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void
 	const struct bpf_kprobe_multi_link *link = priv;
 	unsigned long *addr_a = a, *addr_b = b;
 	u64 *cookie_a, *cookie_b;
-	unsigned long tmp1;
-	u64 tmp2;
 
 	cookie_a = link->cookies + (addr_a - link->addrs);
 	cookie_b = link->cookies + (addr_b - link->addrs);
 
 	/* swap addr_a/addr_b and cookie_a/cookie_b values */
-	tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1;
-	tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2;
+	swap(*addr_a, *addr_b);
+	swap(*cookie_a, *cookie_b);
 }
 
 static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
-- 
cgit v1.2.3


From 8eb943fc5e5fa62751248302fe8f6c9856d81c09 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Wed, 23 Mar 2022 15:36:26 +0800
Subject: bpf: Remove redundant assignment to smap->map.value_size

The attr->value_size is already assigned to smap->map.value_size
in bpf_map_init_from_attr(), there is no need to do it again in
stack_map_alloc().

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/bpf/20220323073626.958652-1-ytcoode@gmail.com
---
 kernel/bpf/stackmap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 34725bfa1e97..6131b4a19572 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -106,7 +106,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 
 	bpf_map_init_from_attr(&smap->map, attr);
-	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
 
 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
-- 
cgit v1.2.3


From efe34e99fc41ad2b3fd3bbe79ecd3906620ec961 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Mon, 28 Mar 2022 15:32:25 -0700
Subject: tracing/user_events: Hold event_mutex during dyn_event_add

Make sure the event_mutex is properly held during dyn_event_add call.
This is required when adding dynamic events.

Link: https://lkml.kernel.org/r/20220328223225.1992-1-beaub@linux.microsoft.com

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 8b3d241a31c2..61d78d64bdf0 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1165,11 +1165,11 @@ static int user_event_parse(char *name, char *args, char *flags,
 #endif
 
 	mutex_lock(&event_mutex);
+
 	ret = user_event_trace_register(user);
-	mutex_unlock(&event_mutex);
 
 	if (ret)
-		goto put_user;
+		goto put_user_lock;
 
 	user->index = index;
 
@@ -1181,8 +1181,12 @@ static int user_event_parse(char *name, char *args, char *flags,
 	set_bit(user->index, page_bitmap);
 	hash_add(register_table, &user->node, key);
 
+	mutex_unlock(&event_mutex);
+
 	*newuser = user;
 	return 0;
+put_user_lock:
+	mutex_unlock(&event_mutex);
 put_user:
 	user_event_destroy_fields(user);
 	user_event_destroy_validators(user);
-- 
cgit v1.2.3


From 768c1e7f1de03afd0b55e0e951efc272309eeb52 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 29 Mar 2022 10:30:51 -0700
Subject: tracing/user_events: Remove eBPF interfaces

Remove eBPF interfaces within user_events to ensure they are fully
reviewed.

Link: https://lore.kernel.org/all/20220329165718.GA10381@kbox/
Link: https://lkml.kernel.org/r/20220329173051.10087-1-beaub@linux.microsoft.com

Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 73 +---------------------------------------
 1 file changed, 1 insertion(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 61d78d64bdf0..846c27bc7aef 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -42,9 +42,6 @@
 #define MAX_FIELD_ARRAY_SIZE 1024
 #define MAX_FIELD_ARG_NAME 256
 
-#define MAX_BPF_COPY_SIZE PAGE_SIZE
-#define MAX_STACK_BPF_DATA 512
-
 static char *register_page_data;
 
 static DEFINE_MUTEX(reg_mutex);
@@ -405,19 +402,6 @@ parse:
 				    type[0] != 'u', FILTER_OTHER);
 }
 
-static void user_event_parse_flags(struct user_event *user, char *flags)
-{
-	char *flag;
-
-	if (flags == NULL)
-		return;
-
-	while ((flag = strsep(&flags, ",")) != NULL) {
-		if (strcmp(flag, "BPF_ITER") == 0)
-			user->flags |= FLAG_BPF_ITER;
-	}
-}
-
 static int user_event_parse_fields(struct user_event *user, char *args)
 {
 	char *field;
@@ -713,64 +697,14 @@ discard:
 }
 
 #ifdef CONFIG_PERF_EVENTS
-static void user_event_bpf(struct user_event *user, struct iov_iter *i)
-{
-	struct user_bpf_context context;
-	struct user_bpf_iter bpf_i;
-	char fast_data[MAX_STACK_BPF_DATA];
-	void *temp = NULL;
-
-	if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
-		/* Raw iterator */
-		context.data_type = USER_BPF_DATA_ITER;
-		context.data_len = i->count;
-		context.iter = &bpf_i;
-
-		bpf_i.iov_offset = i->iov_offset;
-		bpf_i.iov = i->iov;
-		bpf_i.nr_segs = i->nr_segs;
-	} else if (i->nr_segs == 1 && iter_is_iovec(i)) {
-		/* Single buffer from user */
-		context.data_type = USER_BPF_DATA_USER;
-		context.data_len = i->count;
-		context.udata = i->iov->iov_base + i->iov_offset;
-	} else {
-		/* Multi buffer from user */
-		struct iov_iter copy = *i;
-		size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
-
-		context.data_type = USER_BPF_DATA_KERNEL;
-		context.kdata = fast_data;
-
-		if (unlikely(copy_size > sizeof(fast_data))) {
-			temp = kmalloc(copy_size, GFP_NOWAIT);
-
-			if (temp)
-				context.kdata = temp;
-			else
-				copy_size = sizeof(fast_data);
-		}
-
-		context.data_len = copy_nofault(context.kdata,
-						copy_size, &copy);
-	}
-
-	trace_call_bpf(&user->call, &context);
-
-	kfree(temp);
-}
-
 /*
- * Writes the user supplied payload out to perf ring buffer or eBPF program.
+ * Writes the user supplied payload out to perf ring buffer.
  */
 static void user_event_perf(struct user_event *user, struct iov_iter *i,
 			    void *tpdata, bool *faulted)
 {
 	struct hlist_head *perf_head;
 
-	if (bpf_prog_array_valid(&user->call))
-		user_event_bpf(user, i);
-
 	perf_head = this_cpu_ptr(user->call.perf_events);
 
 	if (perf_head && !hlist_empty(perf_head)) {
@@ -1136,8 +1070,6 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	user->tracepoint.name = name;
 
-	user_event_parse_flags(user, flags);
-
 	ret = user_event_parse_fields(user, args);
 
 	if (ret)
@@ -1579,9 +1511,6 @@ static int user_seq_show(struct seq_file *m, void *p)
 			busy++;
 		}
 
-		if (flags & FLAG_BPF_ITER)
-			seq_puts(m, " FLAG:BPF_ITER");
-
 		seq_puts(m, "\n");
 		active++;
 	}
-- 
cgit v1.2.3


From fcbf591cedbd44d82d7765b3c7e261858108593f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 30 Mar 2022 15:58:35 -0400
Subject: tracing: Set user_events to BROKEN

After being merged, user_events become more visible to a wider audience
that have concerns with the current API. It is too late to fix this for
this release, but instead of a full revert, just mark it as BROKEN (which
prevents it from being selected in make config). Then we can work finding
a better API. If that fails, then it will need to be completely reverted.

Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/
Link: https://lkml.kernel.org/r/20220330155835.5e1f6669@gandalf.local.home

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 16a52a71732d..f83d29adb375 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -741,6 +741,7 @@ config USER_EVENTS
 	bool "User trace events"
 	select TRACING
 	select DYNAMIC_EVENTS
+	depends on BROKEN || COMPILE_TEST # API needs to be straighten out
 	help
 	  User trace events are user-defined trace events that
 	  can be used like an existing kernel trace event.  User trace
-- 
cgit v1.2.3


From 18bfee3216fa6f28d55ebf88d824a539d2bec3c7 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 30 Mar 2022 09:00:19 +0200
Subject: ftrace: Make ftrace_graph_is_dead() a static branch

ftrace_graph_is_dead() is used on hot paths, it just reads a variable
in memory and is not worth suffering function call constraints.

For instance, at entry of prepare_ftrace_return(), inlining it avoids
saving prepare_ftrace_return() parameters to stack and restoring them
after calling ftrace_graph_is_dead().

While at it using a static branch is even more performant and is
rather well adapted considering that the returned value will almost
never change.

Inline ftrace_graph_is_dead() and replace 'kill_ftrace_graph' bool
by a static branch.

The performance improvement is noticeable.

Link: https://lkml.kernel.org/r/e0411a6a0ed3eafff0ad2bc9cd4b0e202b4617df.1648623570.git.christophe.leroy@csgroup.eu

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/fgraph.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 19028e072cdb..8f4fb328133a 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
  *
  * Highly modified by Steven Rostedt (VMware).
  */
+#include <linux/jump_label.h>
 #include <linux/suspend.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
@@ -23,24 +24,12 @@
 #define ASSIGN_OPS_HASH(opsname, val)
 #endif
 
-static bool kill_ftrace_graph;
+DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 
-/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
- */
-bool ftrace_graph_is_dead(void)
-{
-	return kill_ftrace_graph;
-}
-
 /**
  * ftrace_graph_stop - set to permanently disable function graph tracing
  *
@@ -51,7 +40,7 @@ bool ftrace_graph_is_dead(void)
  */
 void ftrace_graph_stop(void)
 {
-	kill_ftrace_graph = true;
+	static_branch_enable(&kill_ftrace_graph);
 }
 
 /* Add a function return address to the trace stack on thread info.*/
-- 
cgit v1.2.3


From 5cfff569cab8bf544bab62c911c5d6efd5af5e05 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 1 Apr 2022 14:39:03 -0400
Subject: tracing: Move user_events.h temporarily out of include/uapi

While user_events API is under development and has been marked for broken
to not let the API become fixed, move the header file out of the uapi
directory. This is to prevent it from being installed, then later changed,
and then have an old distro user space update with a new kernel, where
applications see the user_events being available, but the old header is in
place, and then they get compiled incorrectly.

Also, surround the include with CONFIG_COMPILE_TEST to the current
location, but when the BROKEN tag is taken off, it will use the uapi
directory, and fail to compile. This is a good way to remind us to move
the header back.

Link: https://lore.kernel.org/all/20220330155835.5e1f6669@gandalf.local.home
Link: https://lkml.kernel.org/r/20220330201755.29319-1-mathieu.desnoyers@efficios.com
Link: https://lkml.kernel.org/r/20220401143903.188384f3@gandalf.local.home

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 846c27bc7aef..706e1686b5eb 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -18,7 +18,12 @@
 #include <linux/tracefs.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+/* Reminder to move to uapi when everything works */
+#ifdef CONFIG_COMPILE_TEST
+#include <linux/user_events.h>
+#else
 #include <uapi/linux/user_events.h>
+#endif
 #include "trace.h"
 #include "trace_dynevent.h"
 
-- 
cgit v1.2.3


From 1cd927ad6f62f27d8908498dcbf61395c5dd5fe2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 1 Apr 2022 14:39:03 -0400
Subject: tracing: mark user_events as BROKEN

After being merged, user_events become more visible to a wider audience
that have concerns with the current API.

It is too late to fix this for this release, but instead of a full
revert, just mark it as BROKEN (which prevents it from being selected in
make config).  Then we can work finding a better API.  If that fails,
then it will need to be completely reverted.

To not have the code silently bitrot, still allow building it with
COMPILE_TEST.

And to prevent the uapi header from being installed, then later changed,
and then have an old distro user space see the old version, move the
header file out of the uapi directory.

Surround the include with CONFIG_COMPILE_TEST to the current location,
but when the BROKEN tag is taken off, it will use the uapi directory,
and fail to compile.  This is a good way to remind us to move the header
back.

Link: https://lore.kernel.org/all/20220330155835.5e1f6669@gandalf.local.home
Link: https://lkml.kernel.org/r/20220330201755.29319-1-mathieu.desnoyers@efficios.com
Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/Kconfig             | 1 +
 kernel/trace/trace_events_user.c | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9bb54c0b3b2d..2c43e327a619 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -767,6 +767,7 @@ config USER_EVENTS
 	bool "User trace events"
 	select TRACING
 	select DYNAMIC_EVENTS
+	depends on BROKEN || COMPILE_TEST # API needs to be straighten out
 	help
 	  User trace events are user-defined trace events that
 	  can be used like an existing kernel trace event.  User trace
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 8b3d241a31c2..68d62bfac12f 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -18,7 +18,12 @@
 #include <linux/tracefs.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+/* Reminder to move to uapi when everything works */
+#ifdef CONFIG_COMPILE_TEST
+#include <linux/user_events.h>
+#else
 #include <uapi/linux/user_events.h>
+#endif
 #include "trace.h"
 #include "trace_dynevent.h"
 
-- 
cgit v1.2.3


From b490207017ba237d97b735b2aa66dc241ccd18f5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 28 Mar 2022 18:07:04 +0100
Subject: watch_queue: Free the page array when watch_queue is dismantled

Commit 7ea1a0124b6d ("watch_queue: Free the alloc bitmap when the
watch_queue is torn down") took care of the bitmap, but not the page
array.

  BUG: memory leak
  unreferenced object 0xffff88810d9bc140 (size 32):
  comm "syz-executor335", pid 3603, jiffies 4294946994 (age 12.840s)
  hex dump (first 32 bytes):
    40 a7 40 04 00 ea ff ff 00 00 00 00 00 00 00 00  @.@.............
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
     kmalloc_array include/linux/slab.h:621 [inline]
     kcalloc include/linux/slab.h:652 [inline]
     watch_queue_set_size+0x12f/0x2e0 kernel/watch_queue.c:251
     pipe_ioctl+0x82/0x140 fs/pipe.c:632
     vfs_ioctl fs/ioctl.c:51 [inline]
     __do_sys_ioctl fs/ioctl.c:874 [inline]
     __se_sys_ioctl fs/ioctl.c:860 [inline]
     __x64_sys_ioctl+0xfc/0x140 fs/ioctl.c:860
     do_syscall_x64 arch/x86/entry/common.c:50 [inline]

Reported-by: syzbot+25ea042ae28f3888727a@syzkaller.appspotmail.com
Fixes: c73be61cede5 ("pipe: Add general notification queue support")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/r/20220322004654.618274-1-eric.dumazet@gmail.com/
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watch_queue.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 3990e4df3d7b..230038d4f908 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -370,6 +370,7 @@ static void __put_watch_queue(struct kref *kref)
 
 	for (i = 0; i < wqueue->nr_pages; i++)
 		__free_page(wqueue->notes[i]);
+	kfree(wqueue->notes);
 	bitmap_free(wqueue->notes_bitmap);
 
 	wfilter = rcu_access_pointer(wqueue->filter);
-- 
cgit v1.2.3


From 185da3da9379948ffbe45051b16d526c428fb06e Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Thu, 31 Mar 2022 11:19:29 +0200
Subject: bpf: Replace usage of supported with dedicated list iterator variable

To move the list iterator variable into the list_for_each_entry_*()
macro in the future it should be avoided to use the list iterator
variable after the loop body.

To *never* use the list iterator variable after the loop it was
concluded to use a separate iterator variable instead of a
found boolean [1].

This removes the need to use the found variable (existed & supported)
and simply checking if the variable was set, can determine if the
break/goto was hit.

[1] https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/

Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220331091929.647057-1-jakobkoschel@gmail.com
---
 kernel/bpf/bpf_iter.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 110029ede71e..dea920b3b840 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -330,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
 bool bpf_iter_prog_supported(struct bpf_prog *prog)
 {
 	const char *attach_fname = prog->aux->attach_func_name;
+	struct bpf_iter_target_info *tinfo = NULL, *iter;
 	u32 prog_btf_id = prog->aux->attach_btf_id;
 	const char *prefix = BPF_ITER_FUNC_PREFIX;
-	struct bpf_iter_target_info *tinfo;
 	int prefix_len = strlen(prefix);
-	bool supported = false;
 
 	if (strncmp(attach_fname, prefix, prefix_len))
 		return false;
 
 	mutex_lock(&targets_mutex);
-	list_for_each_entry(tinfo, &targets, list) {
-		if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
-			supported = true;
+	list_for_each_entry(iter, &targets, list) {
+		if (iter->btf_id && iter->btf_id == prog_btf_id) {
+			tinfo = iter;
 			break;
 		}
-		if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
-			cache_btf_id(tinfo, prog);
-			supported = true;
+		if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
+			cache_btf_id(iter, prog);
+			tinfo = iter;
 			break;
 		}
 	}
 	mutex_unlock(&targets_mutex);
 
-	if (supported) {
+	if (tinfo) {
 		prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
 		prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
 	}
 
-	return supported;
+	return tinfo != NULL;
 }
 
 const struct bpf_func_proto *
@@ -499,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link)
 int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 			 struct bpf_prog *prog)
 {
+	struct bpf_iter_target_info *tinfo = NULL, *iter;
 	struct bpf_link_primer link_primer;
-	struct bpf_iter_target_info *tinfo;
 	union bpf_iter_link_info linfo;
 	struct bpf_iter_link *link;
 	u32 prog_btf_id, linfo_len;
-	bool existed = false;
 	bpfptr_t ulinfo;
 	int err;
 
@@ -530,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 
 	prog_btf_id = prog->aux->attach_btf_id;
 	mutex_lock(&targets_mutex);
-	list_for_each_entry(tinfo, &targets, list) {
-		if (tinfo->btf_id == prog_btf_id) {
-			existed = true;
+	list_for_each_entry(iter, &targets, list) {
+		if (iter->btf_id == prog_btf_id) {
+			tinfo = iter;
 			break;
 		}
 	}
 	mutex_unlock(&targets_mutex);
-	if (!existed)
+	if (!tinfo)
 		return -ENOENT;
 
 	link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
-- 
cgit v1.2.3


From cfc1d277891eb499b3b5354df33b30f598683e90 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:31 +0000
Subject: module: Move all into module/

No functional changes.

This patch moves all module related code into a separate directory,
modifies each file name and creates a new Makefile. Note: this effort
is in preparation to refactor core module code.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/Makefile            |    5 +-
 kernel/module-internal.h   |   50 -
 kernel/module.c            | 4810 --------------------------------------------
 kernel/module/Makefile     |   12 +
 kernel/module/decompress.c |  273 +++
 kernel/module/internal.h   |   50 +
 kernel/module/main.c       | 4810 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/module/signing.c    |   45 +
 kernel/module_decompress.c |  273 ---
 kernel/module_signing.c    |   45 -
 10 files changed, 5191 insertions(+), 5182 deletions(-)
 delete mode 100644 kernel/module-internal.h
 delete mode 100644 kernel/module.c
 create mode 100644 kernel/module/Makefile
 create mode 100644 kernel/module/decompress.c
 create mode 100644 kernel/module/internal.h
 create mode 100644 kernel/module/main.c
 create mode 100644 kernel/module/signing.c
 delete mode 100644 kernel/module_decompress.c
 delete mode 100644 kernel/module_signing.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 471d71935e90..a7f8c088cc94 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -29,7 +29,6 @@ KCOV_INSTRUMENT_softirq.o := n
 KCSAN_SANITIZE_softirq.o = n
 # These are called from save_stack_trace() on slub debug path,
 # and produce insane amounts of uninteresting coverage.
-KCOV_INSTRUMENT_module.o := n
 KCOV_INSTRUMENT_extable.o := n
 KCOV_INSTRUMENT_stacktrace.o := n
 # Don't self-instrument.
@@ -53,6 +52,7 @@ obj-y += rcu/
 obj-y += livepatch/
 obj-y += dma/
 obj-y += entry/
+obj-$(CONFIG_MODULES) += module/
 
 obj-$(CONFIG_KCMP) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -66,9 +66,6 @@ ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
 obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 8c381c99062f..000000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Module internals
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/elf.h>
-#include <asm/module.h>
-
-struct load_info {
-	const char *name;
-	/* pointer to module in temporary copy, freed at end of load_module() */
-	struct module *mod;
-	Elf_Ehdr *hdr;
-	unsigned long len;
-	Elf_Shdr *sechdrs;
-	char *secstrings, *strtab;
-	unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
-	struct _ddebug *debug;
-	unsigned int num_debug;
-	bool sig_ok;
-#ifdef CONFIG_KALLSYMS
-	unsigned long mod_kallsyms_init_off;
-#endif
-#ifdef CONFIG_MODULE_DECOMPRESS
-	struct page **pages;
-	unsigned int max_pages;
-	unsigned int used_pages;
-#endif
-	struct {
-		unsigned int sym, str, mod, vers, info, pcpu;
-	} index;
-};
-
-extern int mod_verify_sig(const void *mod, struct load_info *info);
-
-#ifdef CONFIG_MODULE_DECOMPRESS
-int module_decompress(struct load_info *info, const void *buf, size_t size);
-void module_decompress_cleanup(struct load_info *info);
-#else
-static inline int module_decompress(struct load_info *info,
-				    const void *buf, size_t size)
-{
-	return -EOPNOTSUPP;
-}
-static inline void module_decompress_cleanup(struct load_info *info)
-{
-}
-#endif
diff --git a/kernel/module.c b/kernel/module.c
deleted file mode 100644
index 6cea788fd965..000000000000
--- a/kernel/module.c
+++ /dev/null
@@ -1,4810 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2002 Richard Henderson
- * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
- */
-
-#define INCLUDE_VERMAGIC
-
-#include <linux/export.h>
-#include <linux/extable.h>
-#include <linux/moduleloader.h>
-#include <linux/module_signature.h>
-#include <linux/trace_events.h>
-#include <linux/init.h>
-#include <linux/kallsyms.h>
-#include <linux/buildid.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/sysfs.h>
-#include <linux/kernel.h>
-#include <linux/kernel_read_file.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/elf.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/fcntl.h>
-#include <linux/rcupdate.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/moduleparam.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/vermagic.h>
-#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/device.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <linux/set_memory.h>
-#include <asm/mmu_context.h>
-#include <linux/license.h>
-#include <asm/sections.h>
-#include <linux/tracepoint.h>
-#include <linux/ftrace.h>
-#include <linux/livepatch.h>
-#include <linux/async.h>
-#include <linux/percpu.h>
-#include <linux/kmemleak.h>
-#include <linux/jump_label.h>
-#include <linux/pfn.h>
-#include <linux/bsearch.h>
-#include <linux/dynamic_debug.h>
-#include <linux/audit.h>
-#include <uapi/linux/module.h>
-#include "module-internal.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/module.h>
-
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
-/*
- * Mutex protects:
- * 1) List of modules (also safely readable with preempt_disable),
- * 2) module_use links,
- * 3) module_addr_min/module_addr_max.
- * (delete and add uses RCU list operations).
- */
-static DEFINE_MUTEX(module_mutex);
-static LIST_HEAD(modules);
-
-/* Work queue for freeing init sections in success case */
-static void do_free_init(struct work_struct *w);
-static DECLARE_WORK(init_free_wq, do_free_init);
-static LLIST_HEAD(init_free_list);
-
-#ifdef CONFIG_MODULES_TREE_LOOKUP
-
-/*
- * Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
- *
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
- */
-
-static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
-{
-	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
-	return (unsigned long)layout->base;
-}
-
-static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
-{
-	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
-	return (unsigned long)layout->size;
-}
-
-static __always_inline bool
-mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
-{
-	return __mod_tree_val(a) < __mod_tree_val(b);
-}
-
-static __always_inline int
-mod_tree_comp(void *key, struct latch_tree_node *n)
-{
-	unsigned long val = (unsigned long)key;
-	unsigned long start, end;
-
-	start = __mod_tree_val(n);
-	if (val < start)
-		return -1;
-
-	end = start + __mod_tree_size(n);
-	if (val >= end)
-		return 1;
-
-	return 0;
-}
-
-static const struct latch_tree_ops mod_tree_ops = {
-	.less = mod_tree_less,
-	.comp = mod_tree_comp,
-};
-
-static struct mod_tree_root {
-	struct latch_tree_root root;
-	unsigned long addr_min;
-	unsigned long addr_max;
-} mod_tree __cacheline_aligned = {
-	.addr_min = -1UL,
-};
-
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
-{
-	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-static void __mod_tree_remove(struct mod_tree_node *node)
-{
-	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-/*
- * These modifications: insert, remove_init and remove; are serialized by the
- * module_mutex.
- */
-static void mod_tree_insert(struct module *mod)
-{
-	mod->core_layout.mtn.mod = mod;
-	mod->init_layout.mtn.mod = mod;
-
-	__mod_tree_insert(&mod->core_layout.mtn);
-	if (mod->init_layout.size)
-		__mod_tree_insert(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove_init(struct module *mod)
-{
-	if (mod->init_layout.size)
-		__mod_tree_remove(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove(struct module *mod)
-{
-	__mod_tree_remove(&mod->core_layout.mtn);
-	mod_tree_remove_init(mod);
-}
-
-static struct module *mod_find(unsigned long addr)
-{
-	struct latch_tree_node *ltn;
-
-	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
-	if (!ltn)
-		return NULL;
-
-	return container_of(ltn, struct mod_tree_node, node)->mod;
-}
-
-#else /* MODULES_TREE_LOOKUP */
-
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-static void mod_tree_insert(struct module *mod) { }
-static void mod_tree_remove_init(struct module *mod) { }
-static void mod_tree_remove(struct module *mod) { }
-
-static struct module *mod_find(unsigned long addr)
-{
-	struct module *mod;
-
-	list_for_each_entry_rcu(mod, &modules, list,
-				lockdep_is_held(&module_mutex)) {
-		if (within_module(addr, mod))
-			return mod;
-	}
-
-	return NULL;
-}
-
-#endif /* MODULES_TREE_LOOKUP */
-
-/*
- * Bounds of module text, for speeding up __module_address.
- * Protected by module_mutex.
- */
-static void __mod_update_bounds(void *base, unsigned int size)
-{
-	unsigned long min = (unsigned long)base;
-	unsigned long max = min + size;
-
-	if (min < module_addr_min)
-		module_addr_min = min;
-	if (max > module_addr_max)
-		module_addr_max = max;
-}
-
-static void mod_update_bounds(struct module *mod)
-{
-	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
-	if (mod->init_layout.size)
-		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
-}
-
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
-static void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
-	if (unlikely(!debug_locks))
-		return;
-
-	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
-		!lockdep_is_held(&module_mutex));
-#endif
-}
-
-#ifdef CONFIG_MODULE_SIG
-static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-module_param(sig_enforce, bool_enable_only, 0644);
-
-void set_module_sig_enforced(void)
-{
-	sig_enforce = true;
-}
-#else
-#define sig_enforce false
-#endif
-
-/*
- * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
- * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
- */
-bool is_module_sig_enforced(void)
-{
-	return sig_enforce;
-}
-EXPORT_SYMBOL(is_module_sig_enforced);
-
-/* Block module loading/unloading? */
-int modules_disabled = 0;
-core_param(nomodule, modules_disabled, bint, 0);
-
-/* Waiting for a module to finish initializing? */
-static DECLARE_WAIT_QUEUE_HEAD(module_wq);
-
-static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-
-int register_module_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(register_module_notifier);
-
-int unregister_module_notifier(struct notifier_block *nb)
-{
-	return blocking_notifier_chain_unregister(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(unregister_module_notifier);
-
-/*
- * We require a truly strong try_module_get(): 0 means success.
- * Otherwise an error is returned due to ongoing or failed
- * initialization etc.
- */
-static inline int strong_try_module_get(struct module *mod)
-{
-	BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
-	if (mod && mod->state == MODULE_STATE_COMING)
-		return -EBUSY;
-	if (try_module_get(mod))
-		return 0;
-	else
-		return -ENOENT;
-}
-
-static inline void add_taint_module(struct module *mod, unsigned flag,
-				    enum lockdep_ok lockdep_ok)
-{
-	add_taint(flag, lockdep_ok);
-	set_bit(flag, &mod->taints);
-}
-
-/*
- * A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit.
- */
-void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
-{
-	module_put(mod);
-	kthread_exit(code);
-}
-EXPORT_SYMBOL(__module_put_and_kthread_exit);
-
-/* Find a module section: 0 means not found. */
-static unsigned int find_sec(const struct load_info *info, const char *name)
-{
-	unsigned int i;
-
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		Elf_Shdr *shdr = &info->sechdrs[i];
-		/* Alloc bit cleared means "ignore it." */
-		if ((shdr->sh_flags & SHF_ALLOC)
-		    && strcmp(info->secstrings + shdr->sh_name, name) == 0)
-			return i;
-	}
-	return 0;
-}
-
-/* Find a module section, or NULL. */
-static void *section_addr(const struct load_info *info, const char *name)
-{
-	/* Section 0 has sh_addr 0. */
-	return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
-}
-
-/* Find a module section, or NULL.  Fill in number of "objects" in section. */
-static void *section_objs(const struct load_info *info,
-			  const char *name,
-			  size_t object_size,
-			  unsigned int *num)
-{
-	unsigned int sec = find_sec(info, name);
-
-	/* Section 0 has sh_addr 0 and sh_size 0. */
-	*num = info->sechdrs[sec].sh_size / object_size;
-	return (void *)info->sechdrs[sec].sh_addr;
-}
-
-/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */
-static unsigned int find_any_sec(const struct load_info *info, const char *name)
-{
-	unsigned int i;
-
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		Elf_Shdr *shdr = &info->sechdrs[i];
-		if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
-			return i;
-	}
-	return 0;
-}
-
-/*
- * Find a module section, or NULL. Fill in number of "objects" in section.
- * Ignores SHF_ALLOC flag.
- */
-static __maybe_unused void *any_section_objs(const struct load_info *info,
-					     const char *name,
-					     size_t object_size,
-					     unsigned int *num)
-{
-	unsigned int sec = find_any_sec(info, name);
-
-	/* Section 0 has sh_addr 0 and sh_size 0. */
-	*num = info->sechdrs[sec].sh_size / object_size;
-	return (void *)info->sechdrs[sec].sh_addr;
-}
-
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
-
-#ifndef CONFIG_MODVERSIONS
-#define symversion(base, idx) NULL
-#else
-#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
-#endif
-
-struct symsearch {
-	const struct kernel_symbol *start, *stop;
-	const s32 *crcs;
-	enum mod_license {
-		NOT_GPL_ONLY,
-		GPL_ONLY,
-	} license;
-};
-
-struct find_symbol_arg {
-	/* Input */
-	const char *name;
-	bool gplok;
-	bool warn;
-
-	/* Output */
-	struct module *owner;
-	const s32 *crc;
-	const struct kernel_symbol *sym;
-	enum mod_license license;
-};
-
-static bool check_exported_symbol(const struct symsearch *syms,
-				  struct module *owner,
-				  unsigned int symnum, void *data)
-{
-	struct find_symbol_arg *fsa = data;
-
-	if (!fsa->gplok && syms->license == GPL_ONLY)
-		return false;
-	fsa->owner = owner;
-	fsa->crc = symversion(syms->crcs, symnum);
-	fsa->sym = &syms->start[symnum];
-	fsa->license = syms->license;
-	return true;
-}
-
-static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-	return (unsigned long)offset_to_ptr(&sym->value_offset);
-#else
-	return sym->value;
-#endif
-}
-
-static const char *kernel_symbol_name(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-	return offset_to_ptr(&sym->name_offset);
-#else
-	return sym->name;
-#endif
-}
-
-static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-	if (!sym->namespace_offset)
-		return NULL;
-	return offset_to_ptr(&sym->namespace_offset);
-#else
-	return sym->namespace;
-#endif
-}
-
-static int cmp_name(const void *name, const void *sym)
-{
-	return strcmp(name, kernel_symbol_name(sym));
-}
-
-static bool find_exported_symbol_in_section(const struct symsearch *syms,
-					    struct module *owner,
-					    void *data)
-{
-	struct find_symbol_arg *fsa = data;
-	struct kernel_symbol *sym;
-
-	sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
-			sizeof(struct kernel_symbol), cmp_name);
-
-	if (sym != NULL && check_exported_symbol(syms, owner,
-						 sym - syms->start, data))
-		return true;
-
-	return false;
-}
-
-/*
- * Find an exported symbol and return it, along with, (optional) crc and
- * (optional) module which owns it.  Needs preempt disabled or module_mutex.
- */
-static bool find_symbol(struct find_symbol_arg *fsa)
-{
-	static const struct symsearch arr[] = {
-		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
-		  NOT_GPL_ONLY },
-		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
-		  __start___kcrctab_gpl,
-		  GPL_ONLY },
-	};
-	struct module *mod;
-	unsigned int i;
-
-	module_assert_mutex_or_preempt();
-
-	for (i = 0; i < ARRAY_SIZE(arr); i++)
-		if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
-			return true;
-
-	list_for_each_entry_rcu(mod, &modules, list,
-				lockdep_is_held(&module_mutex)) {
-		struct symsearch arr[] = {
-			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
-			  NOT_GPL_ONLY },
-			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
-			  mod->gpl_crcs,
-			  GPL_ONLY },
-		};
-
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-
-		for (i = 0; i < ARRAY_SIZE(arr); i++)
-			if (find_exported_symbol_in_section(&arr[i], mod, fsa))
-				return true;
-	}
-
-	pr_debug("Failed to find symbol %s\n", fsa->name);
-	return false;
-}
-
-/*
- * Search for module by name: must hold module_mutex (or preempt disabled
- * for read-only access).
- */
-static struct module *find_module_all(const char *name, size_t len,
-				      bool even_unformed)
-{
-	struct module *mod;
-
-	module_assert_mutex_or_preempt();
-
-	list_for_each_entry_rcu(mod, &modules, list,
-				lockdep_is_held(&module_mutex)) {
-		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
-			return mod;
-	}
-	return NULL;
-}
-
-struct module *find_module(const char *name)
-{
-	return find_module_all(name, strlen(name), false);
-}
-
-#ifdef CONFIG_SMP
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
-	return mod->percpu;
-}
-
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
-	Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
-	unsigned long align = pcpusec->sh_addralign;
-
-	if (!pcpusec->sh_size)
-		return 0;
-
-	if (align > PAGE_SIZE) {
-		pr_warn("%s: per-cpu alignment %li > %li\n",
-			mod->name, align, PAGE_SIZE);
-		align = PAGE_SIZE;
-	}
-
-	mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
-	if (!mod->percpu) {
-		pr_warn("%s: Could not allocate %lu bytes percpu data\n",
-			mod->name, (unsigned long)pcpusec->sh_size);
-		return -ENOMEM;
-	}
-	mod->percpu_size = pcpusec->sh_size;
-	return 0;
-}
-
-static void percpu_modfree(struct module *mod)
-{
-	free_percpu(mod->percpu);
-}
-
-static unsigned int find_pcpusec(struct load_info *info)
-{
-	return find_sec(info, ".data..percpu");
-}
-
-static void percpu_modcopy(struct module *mod,
-			   const void *from, unsigned long size)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
-}
-
-bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
-{
-	struct module *mod;
-	unsigned int cpu;
-
-	preempt_disable();
-
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (!mod->percpu_size)
-			continue;
-		for_each_possible_cpu(cpu) {
-			void *start = per_cpu_ptr(mod->percpu, cpu);
-			void *va = (void *)addr;
-
-			if (va >= start && va < start + mod->percpu_size) {
-				if (can_addr) {
-					*can_addr = (unsigned long) (va - start);
-					*can_addr += (unsigned long)
-						per_cpu_ptr(mod->percpu,
-							    get_boot_cpu_id());
-				}
-				preempt_enable();
-				return true;
-			}
-		}
-	}
-
-	preempt_enable();
-	return false;
-}
-
-/**
- * is_module_percpu_address() - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * Return: %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
-{
-	return __is_module_percpu_address(addr, NULL);
-}
-
-#else /* ... !CONFIG_SMP */
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
-	return NULL;
-}
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
-	/* UP modules shouldn't have this section: ENOMEM isn't quite right */
-	if (info->sechdrs[info->index.pcpu].sh_size != 0)
-		return -ENOMEM;
-	return 0;
-}
-static inline void percpu_modfree(struct module *mod)
-{
-}
-static unsigned int find_pcpusec(struct load_info *info)
-{
-	return 0;
-}
-static inline void percpu_modcopy(struct module *mod,
-				  const void *from, unsigned long size)
-{
-	/* pcpusec should be 0, and size of that section should be 0. */
-	BUG_ON(size != 0);
-}
-bool is_module_percpu_address(unsigned long addr)
-{
-	return false;
-}
-
-bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
-{
-	return false;
-}
-
-#endif /* CONFIG_SMP */
-
-#define MODINFO_ATTR(field)	\
-static void setup_modinfo_##field(struct module *mod, const char *s)  \
-{                                                                     \
-	mod->field = kstrdup(s, GFP_KERNEL);                          \
-}                                                                     \
-static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
-			struct module_kobject *mk, char *buffer)      \
-{                                                                     \
-	return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field);  \
-}                                                                     \
-static int modinfo_##field##_exists(struct module *mod)               \
-{                                                                     \
-	return mod->field != NULL;                                    \
-}                                                                     \
-static void free_modinfo_##field(struct module *mod)                  \
-{                                                                     \
-	kfree(mod->field);                                            \
-	mod->field = NULL;                                            \
-}                                                                     \
-static struct module_attribute modinfo_##field = {                    \
-	.attr = { .name = __stringify(field), .mode = 0444 },         \
-	.show = show_modinfo_##field,                                 \
-	.setup = setup_modinfo_##field,                               \
-	.test = modinfo_##field##_exists,                             \
-	.free = free_modinfo_##field,                                 \
-};
-
-MODINFO_ATTR(version);
-MODINFO_ATTR(srcversion);
-
-static char last_unloaded_module[MODULE_NAME_LEN+1];
-
-#ifdef CONFIG_MODULE_UNLOAD
-
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
-/* MODULE_REF_BASE is the base reference count by kmodule loader. */
-#define MODULE_REF_BASE	1
-
-/* Init the unload section of the module. */
-static int module_unload_init(struct module *mod)
-{
-	/*
-	 * Initialize reference counter to MODULE_REF_BASE.
-	 * refcnt == 0 means module is going.
-	 */
-	atomic_set(&mod->refcnt, MODULE_REF_BASE);
-
-	INIT_LIST_HEAD(&mod->source_list);
-	INIT_LIST_HEAD(&mod->target_list);
-
-	/* Hold reference count during initialization. */
-	atomic_inc(&mod->refcnt);
-
-	return 0;
-}
-
-/* Does a already use b? */
-static int already_uses(struct module *a, struct module *b)
-{
-	struct module_use *use;
-
-	list_for_each_entry(use, &b->source_list, source_list) {
-		if (use->source == a) {
-			pr_debug("%s uses %s!\n", a->name, b->name);
-			return 1;
-		}
-	}
-	pr_debug("%s does not use %s!\n", a->name, b->name);
-	return 0;
-}
-
-/*
- * Module a uses b
- *  - we add 'a' as a "source", 'b' as a "target" of module use
- *  - the module_use is added to the list of 'b' sources (so
- *    'b' can walk the list to see who sourced them), and of 'a'
- *    targets (so 'a' can see what modules it targets).
- */
-static int add_module_usage(struct module *a, struct module *b)
-{
-	struct module_use *use;
-
-	pr_debug("Allocating new usage for %s.\n", a->name);
-	use = kmalloc(sizeof(*use), GFP_ATOMIC);
-	if (!use)
-		return -ENOMEM;
-
-	use->source = a;
-	use->target = b;
-	list_add(&use->source_list, &b->source_list);
-	list_add(&use->target_list, &a->target_list);
-	return 0;
-}
-
-/* Module a uses b: caller needs module_mutex() */
-static int ref_module(struct module *a, struct module *b)
-{
-	int err;
-
-	if (b == NULL || already_uses(a, b))
-		return 0;
-
-	/* If module isn't available, we fail. */
-	err = strong_try_module_get(b);
-	if (err)
-		return err;
-
-	err = add_module_usage(a, b);
-	if (err) {
-		module_put(b);
-		return err;
-	}
-	return 0;
-}
-
-/* Clear the unload stuff of the module. */
-static void module_unload_free(struct module *mod)
-{
-	struct module_use *use, *tmp;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
-		struct module *i = use->target;
-		pr_debug("%s unusing %s\n", mod->name, i->name);
-		module_put(i);
-		list_del(&use->source_list);
-		list_del(&use->target_list);
-		kfree(use);
-	}
-	mutex_unlock(&module_mutex);
-}
-
-#ifdef CONFIG_MODULE_FORCE_UNLOAD
-static inline int try_force_unload(unsigned int flags)
-{
-	int ret = (flags & O_TRUNC);
-	if (ret)
-		add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
-	return ret;
-}
-#else
-static inline int try_force_unload(unsigned int flags)
-{
-	return 0;
-}
-#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-
-/* Try to release refcount of module, 0 means success. */
-static int try_release_module_ref(struct module *mod)
-{
-	int ret;
-
-	/* Try to decrement refcnt which we set at loading */
-	ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
-	BUG_ON(ret < 0);
-	if (ret)
-		/* Someone can put this right now, recover with checking */
-		ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
-
-	return ret;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
-	/* If it's not unused, quit unless we're forcing. */
-	if (try_release_module_ref(mod) != 0) {
-		*forced = try_force_unload(flags);
-		if (!(*forced))
-			return -EWOULDBLOCK;
-	}
-
-	/* Mark it as dying. */
-	mod->state = MODULE_STATE_GOING;
-
-	return 0;
-}
-
-/**
- * module_refcount() - return the refcount or -1 if unloading
- * @mod:	the module we're checking
- *
- * Return:
- *	-1 if the module is in the process of unloading
- *	otherwise the number of references in the kernel to the module
- */
-int module_refcount(struct module *mod)
-{
-	return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
-}
-EXPORT_SYMBOL(module_refcount);
-
-/* This exists whether we can unload or not */
-static void free_module(struct module *mod);
-
-SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
-		unsigned int, flags)
-{
-	struct module *mod;
-	char name[MODULE_NAME_LEN];
-	int ret, forced = 0;
-
-	if (!capable(CAP_SYS_MODULE) || modules_disabled)
-		return -EPERM;
-
-	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
-		return -EFAULT;
-	name[MODULE_NAME_LEN-1] = '\0';
-
-	audit_log_kern_module(name);
-
-	if (mutex_lock_interruptible(&module_mutex) != 0)
-		return -EINTR;
-
-	mod = find_module(name);
-	if (!mod) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	if (!list_empty(&mod->source_list)) {
-		/* Other modules depend on us: get rid of them first. */
-		ret = -EWOULDBLOCK;
-		goto out;
-	}
-
-	/* Doing init or already dying? */
-	if (mod->state != MODULE_STATE_LIVE) {
-		/* FIXME: if (force), slam module count damn the torpedoes */
-		pr_debug("%s already dying\n", mod->name);
-		ret = -EBUSY;
-		goto out;
-	}
-
-	/* If it has an init func, it must have an exit func to unload */
-	if (mod->init && !mod->exit) {
-		forced = try_force_unload(flags);
-		if (!forced) {
-			/* This module can't be removed */
-			ret = -EBUSY;
-			goto out;
-		}
-	}
-
-	ret = try_stop_module(mod, flags, &forced);
-	if (ret != 0)
-		goto out;
-
-	mutex_unlock(&module_mutex);
-	/* Final destruction now no one is using it. */
-	if (mod->exit != NULL)
-		mod->exit();
-	blocking_notifier_call_chain(&module_notify_list,
-				     MODULE_STATE_GOING, mod);
-	klp_module_going(mod);
-	ftrace_release_mod(mod);
-
-	async_synchronize_full();
-
-	/* Store the name of the last unloaded module for diagnostic purposes */
-	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-
-	free_module(mod);
-	/* someone could wait for the module in add_unformed_module() */
-	wake_up_all(&module_wq);
-	return 0;
-out:
-	mutex_unlock(&module_mutex);
-	return ret;
-}
-
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
-	struct module_use *use;
-	int printed_something = 0;
-
-	seq_printf(m, " %i ", module_refcount(mod));
-
-	/*
-	 * Always include a trailing , so userspace can differentiate
-	 * between this and the old multi-field proc format.
-	 */
-	list_for_each_entry(use, &mod->source_list, source_list) {
-		printed_something = 1;
-		seq_printf(m, "%s,", use->source->name);
-	}
-
-	if (mod->init != NULL && mod->exit == NULL) {
-		printed_something = 1;
-		seq_puts(m, "[permanent],");
-	}
-
-	if (!printed_something)
-		seq_puts(m, "-");
-}
-
-void __symbol_put(const char *symbol)
-{
-	struct find_symbol_arg fsa = {
-		.name	= symbol,
-		.gplok	= true,
-	};
-
-	preempt_disable();
-	BUG_ON(!find_symbol(&fsa));
-	module_put(fsa.owner);
-	preempt_enable();
-}
-EXPORT_SYMBOL(__symbol_put);
-
-/* Note this assumes addr is a function, which it currently always is. */
-void symbol_put_addr(void *addr)
-{
-	struct module *modaddr;
-	unsigned long a = (unsigned long)dereference_function_descriptor(addr);
-
-	if (core_kernel_text(a))
-		return;
-
-	/*
-	 * Even though we hold a reference on the module; we still need to
-	 * disable preemption in order to safely traverse the data structure.
-	 */
-	preempt_disable();
-	modaddr = __module_text_address(a);
-	BUG_ON(!modaddr);
-	module_put(modaddr);
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(symbol_put_addr);
-
-static ssize_t show_refcnt(struct module_attribute *mattr,
-			   struct module_kobject *mk, char *buffer)
-{
-	return sprintf(buffer, "%i\n", module_refcount(mk->mod));
-}
-
-static struct module_attribute modinfo_refcnt =
-	__ATTR(refcnt, 0444, show_refcnt, NULL);
-
-void __module_get(struct module *module)
-{
-	if (module) {
-		preempt_disable();
-		atomic_inc(&module->refcnt);
-		trace_module_get(module, _RET_IP_);
-		preempt_enable();
-	}
-}
-EXPORT_SYMBOL(__module_get);
-
-bool try_module_get(struct module *module)
-{
-	bool ret = true;
-
-	if (module) {
-		preempt_disable();
-		/* Note: here, we can fail to get a reference */
-		if (likely(module_is_live(module) &&
-			   atomic_inc_not_zero(&module->refcnt) != 0))
-			trace_module_get(module, _RET_IP_);
-		else
-			ret = false;
-
-		preempt_enable();
-	}
-	return ret;
-}
-EXPORT_SYMBOL(try_module_get);
-
-void module_put(struct module *module)
-{
-	int ret;
-
-	if (module) {
-		preempt_disable();
-		ret = atomic_dec_if_positive(&module->refcnt);
-		WARN_ON(ret < 0);	/* Failed to put refcount */
-		trace_module_put(module, _RET_IP_);
-		preempt_enable();
-	}
-}
-EXPORT_SYMBOL(module_put);
-
-#else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
-	/* We don't know the usage count, or what modules are using. */
-	seq_puts(m, " - -");
-}
-
-static inline void module_unload_free(struct module *mod)
-{
-}
-
-static int ref_module(struct module *a, struct module *b)
-{
-	return strong_try_module_get(b);
-}
-
-static inline int module_unload_init(struct module *mod)
-{
-	return 0;
-}
-#endif /* CONFIG_MODULE_UNLOAD */
-
-static size_t module_flags_taint(struct module *mod, char *buf)
-{
-	size_t l = 0;
-	int i;
-
-	for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
-		if (taint_flags[i].module && test_bit(i, &mod->taints))
-			buf[l++] = taint_flags[i].c_true;
-	}
-
-	return l;
-}
-
-static ssize_t show_initstate(struct module_attribute *mattr,
-			      struct module_kobject *mk, char *buffer)
-{
-	const char *state = "unknown";
-
-	switch (mk->mod->state) {
-	case MODULE_STATE_LIVE:
-		state = "live";
-		break;
-	case MODULE_STATE_COMING:
-		state = "coming";
-		break;
-	case MODULE_STATE_GOING:
-		state = "going";
-		break;
-	default:
-		BUG();
-	}
-	return sprintf(buffer, "%s\n", state);
-}
-
-static struct module_attribute modinfo_initstate =
-	__ATTR(initstate, 0444, show_initstate, NULL);
-
-static ssize_t store_uevent(struct module_attribute *mattr,
-			    struct module_kobject *mk,
-			    const char *buffer, size_t count)
-{
-	int rc;
-
-	rc = kobject_synth_uevent(&mk->kobj, buffer, count);
-	return rc ? rc : count;
-}
-
-struct module_attribute module_uevent =
-	__ATTR(uevent, 0200, NULL, store_uevent);
-
-static ssize_t show_coresize(struct module_attribute *mattr,
-			     struct module_kobject *mk, char *buffer)
-{
-	return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
-}
-
-static struct module_attribute modinfo_coresize =
-	__ATTR(coresize, 0444, show_coresize, NULL);
-
-static ssize_t show_initsize(struct module_attribute *mattr,
-			     struct module_kobject *mk, char *buffer)
-{
-	return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
-}
-
-static struct module_attribute modinfo_initsize =
-	__ATTR(initsize, 0444, show_initsize, NULL);
-
-static ssize_t show_taint(struct module_attribute *mattr,
-			  struct module_kobject *mk, char *buffer)
-{
-	size_t l;
-
-	l = module_flags_taint(mk->mod, buffer);
-	buffer[l++] = '\n';
-	return l;
-}
-
-static struct module_attribute modinfo_taint =
-	__ATTR(taint, 0444, show_taint, NULL);
-
-static struct module_attribute *modinfo_attrs[] = {
-	&module_uevent,
-	&modinfo_version,
-	&modinfo_srcversion,
-	&modinfo_initstate,
-	&modinfo_coresize,
-	&modinfo_initsize,
-	&modinfo_taint,
-#ifdef CONFIG_MODULE_UNLOAD
-	&modinfo_refcnt,
-#endif
-	NULL,
-};
-
-static const char vermagic[] = VERMAGIC_STRING;
-
-static int try_to_force_load(struct module *mod, const char *reason)
-{
-#ifdef CONFIG_MODULE_FORCE_LOAD
-	if (!test_taint(TAINT_FORCED_MODULE))
-		pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
-	add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
-	return 0;
-#else
-	return -ENOEXEC;
-#endif
-}
-
-#ifdef CONFIG_MODVERSIONS
-
-static u32 resolve_rel_crc(const s32 *crc)
-{
-	return *(u32 *)((void *)crc + *crc);
-}
-
-static int check_version(const struct load_info *info,
-			 const char *symname,
-			 struct module *mod,
-			 const s32 *crc)
-{
-	Elf_Shdr *sechdrs = info->sechdrs;
-	unsigned int versindex = info->index.vers;
-	unsigned int i, num_versions;
-	struct modversion_info *versions;
-
-	/* Exporting module didn't supply crcs?  OK, we're already tainted. */
-	if (!crc)
-		return 1;
-
-	/* No versions at all?  modprobe --force does this. */
-	if (versindex == 0)
-		return try_to_force_load(mod, symname) == 0;
-
-	versions = (void *) sechdrs[versindex].sh_addr;
-	num_versions = sechdrs[versindex].sh_size
-		/ sizeof(struct modversion_info);
-
-	for (i = 0; i < num_versions; i++) {
-		u32 crcval;
-
-		if (strcmp(versions[i].name, symname) != 0)
-			continue;
-
-		if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
-			crcval = resolve_rel_crc(crc);
-		else
-			crcval = *crc;
-		if (versions[i].crc == crcval)
-			return 1;
-		pr_debug("Found checksum %X vs module %lX\n",
-			 crcval, versions[i].crc);
-		goto bad_version;
-	}
-
-	/* Broken toolchain. Warn once, then let it go.. */
-	pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
-	return 1;
-
-bad_version:
-	pr_warn("%s: disagrees about version of symbol %s\n",
-	       info->name, symname);
-	return 0;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
-					  struct module *mod)
-{
-	struct find_symbol_arg fsa = {
-		.name	= "module_layout",
-		.gplok	= true,
-	};
-
-	/*
-	 * Since this should be found in kernel (which can't be removed), no
-	 * locking is necessary -- use preempt_disable() to placate lockdep.
-	 */
-	preempt_disable();
-	if (!find_symbol(&fsa)) {
-		preempt_enable();
-		BUG();
-	}
-	preempt_enable();
-	return check_version(info, "module_layout", mod, fsa.crc);
-}
-
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
-			     bool has_crcs)
-{
-	if (has_crcs) {
-		amagic += strcspn(amagic, " ");
-		bmagic += strcspn(bmagic, " ");
-	}
-	return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(const struct load_info *info,
-				const char *symname,
-				struct module *mod,
-				const s32 *crc)
-{
-	return 1;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
-					  struct module *mod)
-{
-	return 1;
-}
-
-static inline int same_magic(const char *amagic, const char *bmagic,
-			     bool has_crcs)
-{
-	return strcmp(amagic, bmagic) == 0;
-}
-#endif /* CONFIG_MODVERSIONS */
-
-static char *get_modinfo(const struct load_info *info, const char *tag);
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
-			      char *prev);
-
-static int verify_namespace_is_imported(const struct load_info *info,
-					const struct kernel_symbol *sym,
-					struct module *mod)
-{
-	const char *namespace;
-	char *imported_namespace;
-
-	namespace = kernel_symbol_namespace(sym);
-	if (namespace && namespace[0]) {
-		imported_namespace = get_modinfo(info, "import_ns");
-		while (imported_namespace) {
-			if (strcmp(namespace, imported_namespace) == 0)
-				return 0;
-			imported_namespace = get_next_modinfo(
-				info, "import_ns", imported_namespace);
-		}
-#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
-		pr_warn(
-#else
-		pr_err(
-#endif
-			"%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
-			mod->name, kernel_symbol_name(sym), namespace);
-#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
-		return -EINVAL;
-#endif
-	}
-	return 0;
-}
-
-static bool inherit_taint(struct module *mod, struct module *owner)
-{
-	if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
-		return true;
-
-	if (mod->using_gplonly_symbols) {
-		pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
-			mod->name, owner->name);
-		return false;
-	}
-
-	if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
-		pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
-			mod->name, owner->name);
-		set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
-	}
-	return true;
-}
-
-/* Resolve a symbol for this module.  I.e. if we find one, record usage. */
-static const struct kernel_symbol *resolve_symbol(struct module *mod,
-						  const struct load_info *info,
-						  const char *name,
-						  char ownername[])
-{
-	struct find_symbol_arg fsa = {
-		.name	= name,
-		.gplok	= !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
-		.warn	= true,
-	};
-	int err;
-
-	/*
-	 * The module_mutex should not be a heavily contended lock;
-	 * if we get the occasional sleep here, we'll go an extra iteration
-	 * in the wait_event_interruptible(), which is harmless.
-	 */
-	sched_annotate_sleep();
-	mutex_lock(&module_mutex);
-	if (!find_symbol(&fsa))
-		goto unlock;
-
-	if (fsa.license == GPL_ONLY)
-		mod->using_gplonly_symbols = true;
-
-	if (!inherit_taint(mod, fsa.owner)) {
-		fsa.sym = NULL;
-		goto getname;
-	}
-
-	if (!check_version(info, name, mod, fsa.crc)) {
-		fsa.sym = ERR_PTR(-EINVAL);
-		goto getname;
-	}
-
-	err = verify_namespace_is_imported(info, fsa.sym, mod);
-	if (err) {
-		fsa.sym = ERR_PTR(err);
-		goto getname;
-	}
-
-	err = ref_module(mod, fsa.owner);
-	if (err) {
-		fsa.sym = ERR_PTR(err);
-		goto getname;
-	}
-
-getname:
-	/* We must make copy under the lock if we failed to get ref. */
-	strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
-unlock:
-	mutex_unlock(&module_mutex);
-	return fsa.sym;
-}
-
-static const struct kernel_symbol *
-resolve_symbol_wait(struct module *mod,
-		    const struct load_info *info,
-		    const char *name)
-{
-	const struct kernel_symbol *ksym;
-	char owner[MODULE_NAME_LEN];
-
-	if (wait_event_interruptible_timeout(module_wq,
-			!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
-			|| PTR_ERR(ksym) != -EBUSY,
-					     30 * HZ) <= 0) {
-		pr_warn("%s: gave up waiting for init of module %s.\n",
-			mod->name, owner);
-	}
-	return ksym;
-}
-
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
-	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-#endif
-
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-struct module_sect_attr {
-	struct bin_attribute battr;
-	unsigned long address;
-};
-
-struct module_sect_attrs {
-	struct attribute_group grp;
-	unsigned int nsections;
-	struct module_sect_attr attrs[];
-};
-
-#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
-static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
-				struct bin_attribute *battr,
-				char *buf, loff_t pos, size_t count)
-{
-	struct module_sect_attr *sattr =
-		container_of(battr, struct module_sect_attr, battr);
-	char bounce[MODULE_SECT_READ_SIZE + 1];
-	size_t wrote;
-
-	if (pos != 0)
-		return -EINVAL;
-
-	/*
-	 * Since we're a binary read handler, we must account for the
-	 * trailing NUL byte that sprintf will write: if "buf" is
-	 * too small to hold the NUL, or the NUL is exactly the last
-	 * byte, the read will look like it got truncated by one byte.
-	 * Since there is no way to ask sprintf nicely to not write
-	 * the NUL, we have to use a bounce buffer.
-	 */
-	wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
-			 kallsyms_show_value(file->f_cred)
-				? (void *)sattr->address : NULL);
-	count = min(count, wrote);
-	memcpy(buf, bounce, count);
-
-	return count;
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
-	unsigned int section;
-
-	for (section = 0; section < sect_attrs->nsections; section++)
-		kfree(sect_attrs->attrs[section].battr.attr.name);
-	kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
-	unsigned int nloaded = 0, i, size[2];
-	struct module_sect_attrs *sect_attrs;
-	struct module_sect_attr *sattr;
-	struct bin_attribute **gattr;
-
-	/* Count loaded sections and allocate structures */
-	for (i = 0; i < info->hdr->e_shnum; i++)
-		if (!sect_empty(&info->sechdrs[i]))
-			nloaded++;
-	size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
-			sizeof(sect_attrs->grp.bin_attrs[0]));
-	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
-	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
-	if (sect_attrs == NULL)
-		return;
-
-	/* Setup section attributes. */
-	sect_attrs->grp.name = "sections";
-	sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
-
-	sect_attrs->nsections = 0;
-	sattr = &sect_attrs->attrs[0];
-	gattr = &sect_attrs->grp.bin_attrs[0];
-	for (i = 0; i < info->hdr->e_shnum; i++) {
-		Elf_Shdr *sec = &info->sechdrs[i];
-		if (sect_empty(sec))
-			continue;
-		sysfs_bin_attr_init(&sattr->battr);
-		sattr->address = sec->sh_addr;
-		sattr->battr.attr.name =
-			kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
-		if (sattr->battr.attr.name == NULL)
-			goto out;
-		sect_attrs->nsections++;
-		sattr->battr.read = module_sect_read;
-		sattr->battr.size = MODULE_SECT_READ_SIZE;
-		sattr->battr.attr.mode = 0400;
-		*(gattr++) = &(sattr++)->battr;
-	}
-	*gattr = NULL;
-
-	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
-		goto out;
-
-	mod->sect_attrs = sect_attrs;
-	return;
-  out:
-	free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
-	if (mod->sect_attrs) {
-		sysfs_remove_group(&mod->mkobj.kobj,
-				   &mod->sect_attrs->grp);
-		/*
-		 * We are positive that no one is using any sect attrs
-		 * at this point.  Deallocate immediately.
-		 */
-		free_sect_attrs(mod->sect_attrs);
-		mod->sect_attrs = NULL;
-	}
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
-	struct kobject *dir;
-	unsigned int notes;
-	struct bin_attribute attrs[];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
-				 struct bin_attribute *bin_attr,
-				 char *buf, loff_t pos, size_t count)
-{
-	/*
-	 * The caller checked the pos and count against our size.
-	 */
-	memcpy(buf, bin_attr->private + pos, count);
-	return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
-			     unsigned int i)
-{
-	if (notes_attrs->dir) {
-		while (i-- > 0)
-			sysfs_remove_bin_file(notes_attrs->dir,
-					      &notes_attrs->attrs[i]);
-		kobject_put(notes_attrs->dir);
-	}
-	kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
-	unsigned int notes, loaded, i;
-	struct module_notes_attrs *notes_attrs;
-	struct bin_attribute *nattr;
-
-	/* failed to create section attributes, so can't create notes */
-	if (!mod->sect_attrs)
-		return;
-
-	/* Count notes sections and allocate structures.  */
-	notes = 0;
-	for (i = 0; i < info->hdr->e_shnum; i++)
-		if (!sect_empty(&info->sechdrs[i]) &&
-		    (info->sechdrs[i].sh_type == SHT_NOTE))
-			++notes;
-
-	if (notes == 0)
-		return;
-
-	notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
-			      GFP_KERNEL);
-	if (notes_attrs == NULL)
-		return;
-
-	notes_attrs->notes = notes;
-	nattr = &notes_attrs->attrs[0];
-	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
-		if (sect_empty(&info->sechdrs[i]))
-			continue;
-		if (info->sechdrs[i].sh_type == SHT_NOTE) {
-			sysfs_bin_attr_init(nattr);
-			nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
-			nattr->attr.mode = S_IRUGO;
-			nattr->size = info->sechdrs[i].sh_size;
-			nattr->private = (void *) info->sechdrs[i].sh_addr;
-			nattr->read = module_notes_read;
-			++nattr;
-		}
-		++loaded;
-	}
-
-	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
-	if (!notes_attrs->dir)
-		goto out;
-
-	for (i = 0; i < notes; ++i)
-		if (sysfs_create_bin_file(notes_attrs->dir,
-					  &notes_attrs->attrs[i]))
-			goto out;
-
-	mod->notes_attrs = notes_attrs;
-	return;
-
-  out:
-	free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
-	if (mod->notes_attrs)
-		free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
-				  const struct load_info *info)
-{
-}
-
-static inline void remove_sect_attrs(struct module *mod)
-{
-}
-
-static inline void add_notes_attrs(struct module *mod,
-				   const struct load_info *info)
-{
-}
-
-static inline void remove_notes_attrs(struct module *mod)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void del_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
-	struct module_use *use;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(use, &mod->target_list, target_list)
-		sysfs_remove_link(use->target->holders_dir, mod->name);
-	mutex_unlock(&module_mutex);
-#endif
-}
-
-static int add_usage_links(struct module *mod)
-{
-	int ret = 0;
-#ifdef CONFIG_MODULE_UNLOAD
-	struct module_use *use;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(use, &mod->target_list, target_list) {
-		ret = sysfs_create_link(use->target->holders_dir,
-					&mod->mkobj.kobj, mod->name);
-		if (ret)
-			break;
-	}
-	mutex_unlock(&module_mutex);
-	if (ret)
-		del_usage_links(mod);
-#endif
-	return ret;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end);
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
-	struct module_attribute *attr;
-	struct module_attribute *temp_attr;
-	int error = 0;
-	int i;
-
-	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
-					(ARRAY_SIZE(modinfo_attrs) + 1)),
-					GFP_KERNEL);
-	if (!mod->modinfo_attrs)
-		return -ENOMEM;
-
-	temp_attr = mod->modinfo_attrs;
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
-		if (!attr->test || attr->test(mod)) {
-			memcpy(temp_attr, attr, sizeof(*temp_attr));
-			sysfs_attr_init(&temp_attr->attr);
-			error = sysfs_create_file(&mod->mkobj.kobj,
-					&temp_attr->attr);
-			if (error)
-				goto error_out;
-			++temp_attr;
-		}
-	}
-
-	return 0;
-
-error_out:
-	if (i > 0)
-		module_remove_modinfo_attrs(mod, --i);
-	else
-		kfree(mod->modinfo_attrs);
-	return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-	struct module_attribute *attr;
-	int i;
-
-	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
-		if (end >= 0 && i > end)
-			break;
-		/* pick a field to test for end of list */
-		if (!attr->attr.name)
-			break;
-		sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
-		if (attr->free)
-			attr->free(mod);
-	}
-	kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
-	DECLARE_COMPLETION_ONSTACK(c);
-	mod->mkobj.kobj_completion = &c;
-	kobject_put(&mod->mkobj.kobj);
-	wait_for_completion(&c);
-}
-
-static int mod_sysfs_init(struct module *mod)
-{
-	int err;
-	struct kobject *kobj;
-
-	if (!module_sysfs_initialized) {
-		pr_err("%s: module sysfs not initialized\n", mod->name);
-		err = -EINVAL;
-		goto out;
-	}
-
-	kobj = kset_find_obj(module_kset, mod->name);
-	if (kobj) {
-		pr_err("%s: module is already loaded\n", mod->name);
-		kobject_put(kobj);
-		err = -EINVAL;
-		goto out;
-	}
-
-	mod->mkobj.mod = mod;
-
-	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
-	mod->mkobj.kobj.kset = module_kset;
-	err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
-				   "%s", mod->name);
-	if (err)
-		mod_kobject_put(mod);
-
-out:
-	return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
-			   const struct load_info *info,
-			   struct kernel_param *kparam,
-			   unsigned int num_params)
-{
-	int err;
-
-	err = mod_sysfs_init(mod);
-	if (err)
-		goto out;
-
-	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
-	if (!mod->holders_dir) {
-		err = -ENOMEM;
-		goto out_unreg;
-	}
-
-	err = module_param_sysfs_setup(mod, kparam, num_params);
-	if (err)
-		goto out_unreg_holders;
-
-	err = module_add_modinfo_attrs(mod);
-	if (err)
-		goto out_unreg_param;
-
-	err = add_usage_links(mod);
-	if (err)
-		goto out_unreg_modinfo_attrs;
-
-	add_sect_attrs(mod, info);
-	add_notes_attrs(mod, info);
-
-	return 0;
-
-out_unreg_modinfo_attrs:
-	module_remove_modinfo_attrs(mod, -1);
-out_unreg_param:
-	module_param_sysfs_remove(mod);
-out_unreg_holders:
-	kobject_put(mod->holders_dir);
-out_unreg:
-	mod_kobject_put(mod);
-out:
-	return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-	remove_notes_attrs(mod);
-	remove_sect_attrs(mod);
-	mod_kobject_put(mod);
-}
-
-static void init_param_lock(struct module *mod)
-{
-	mutex_init(&mod->param_lock);
-}
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
-			   const struct load_info *info,
-			   struct kernel_param *kparam,
-			   unsigned int num_params)
-{
-	return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-}
-
-static void del_usage_links(struct module *mod)
-{
-}
-
-static void init_param_lock(struct module *mod)
-{
-}
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
-	del_usage_links(mod);
-	module_remove_modinfo_attrs(mod, -1);
-	module_param_sysfs_remove(mod);
-	kobject_put(mod->mkobj.drivers_dir);
-	kobject_put(mod->holders_dir);
-	mod_sysfs_fini(mod);
-}
-
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- *          [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^                ^               ^               ^
- * ro_size ------------------------|               |               |
- * ro_after_init_size -----------------------------|               |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base)
- */
-
-/*
- * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
- * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
- * whether we are strict.
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-static void frob_text(const struct module_layout *layout,
-		      int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base,
-		   layout->text_size >> PAGE_SHIFT);
-}
-
-static void module_enable_x(const struct module *mod)
-{
-	frob_text(&mod->core_layout, set_memory_x);
-	frob_text(&mod->init_layout, set_memory_x);
-}
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_STRICT_MODULE_RWX
-static void frob_rodata(const struct module_layout *layout,
-			int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->text_size,
-		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
-				int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->ro_size,
-		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
-			       int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
-		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_ro(const struct module *mod, bool after_init)
-{
-	if (!rodata_enabled)
-		return;
-
-	set_vm_flush_reset_perms(mod->core_layout.base);
-	set_vm_flush_reset_perms(mod->init_layout.base);
-	frob_text(&mod->core_layout, set_memory_ro);
-
-	frob_rodata(&mod->core_layout, set_memory_ro);
-	frob_text(&mod->init_layout, set_memory_ro);
-	frob_rodata(&mod->init_layout, set_memory_ro);
-
-	if (after_init)
-		frob_ro_after_init(&mod->core_layout, set_memory_ro);
-}
-
-static void module_enable_nx(const struct module *mod)
-{
-	frob_rodata(&mod->core_layout, set_memory_nx);
-	frob_ro_after_init(&mod->core_layout, set_memory_nx);
-	frob_writable_data(&mod->core_layout, set_memory_nx);
-	frob_rodata(&mod->init_layout, set_memory_nx);
-	frob_writable_data(&mod->init_layout, set_memory_nx);
-}
-
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
-				       char *secstrings, struct module *mod)
-{
-	const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
-	int i;
-
-	for (i = 0; i < hdr->e_shnum; i++) {
-		if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
-			pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
-				mod->name, secstrings + sechdrs[i].sh_name, i);
-			return -ENOEXEC;
-		}
-	}
-
-	return 0;
-}
-
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_ro(const struct module *mod, bool after_init) {}
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
-				       char *secstrings, struct module *mod)
-{
-	return 0;
-}
-#endif /*  CONFIG_STRICT_MODULE_RWX */
-
-#ifdef CONFIG_LIVEPATCH
-/*
- * Persist Elf information about a module. Copy the Elf header,
- * section header table, section string table, and symtab section
- * index from info to mod->klp_info.
- */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
-	unsigned int size, symndx;
-	int ret;
-
-	size = sizeof(*mod->klp_info);
-	mod->klp_info = kmalloc(size, GFP_KERNEL);
-	if (mod->klp_info == NULL)
-		return -ENOMEM;
-
-	/* Elf header */
-	size = sizeof(mod->klp_info->hdr);
-	memcpy(&mod->klp_info->hdr, info->hdr, size);
-
-	/* Elf section header table */
-	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
-	mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
-	if (mod->klp_info->sechdrs == NULL) {
-		ret = -ENOMEM;
-		goto free_info;
-	}
-
-	/* Elf section name string table */
-	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
-	mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
-	if (mod->klp_info->secstrings == NULL) {
-		ret = -ENOMEM;
-		goto free_sechdrs;
-	}
-
-	/* Elf symbol section index */
-	symndx = info->index.sym;
-	mod->klp_info->symndx = symndx;
-
-	/*
-	 * For livepatch modules, core_kallsyms.symtab is a complete
-	 * copy of the original symbol table. Adjust sh_addr to point
-	 * to core_kallsyms.symtab since the copy of the symtab in module
-	 * init memory is freed at the end of do_init_module().
-	 */
-	mod->klp_info->sechdrs[symndx].sh_addr = \
-		(unsigned long) mod->core_kallsyms.symtab;
-
-	return 0;
-
-free_sechdrs:
-	kfree(mod->klp_info->sechdrs);
-free_info:
-	kfree(mod->klp_info);
-	return ret;
-}
-
-static void free_module_elf(struct module *mod)
-{
-	kfree(mod->klp_info->sechdrs);
-	kfree(mod->klp_info->secstrings);
-	kfree(mod->klp_info);
-}
-#else /* !CONFIG_LIVEPATCH */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
-	return 0;
-}
-
-static void free_module_elf(struct module *mod)
-{
-}
-#endif /* CONFIG_LIVEPATCH */
-
-void __weak module_memfree(void *module_region)
-{
-	/*
-	 * This memory may be RO, and freeing RO memory in an interrupt is not
-	 * supported by vmalloc.
-	 */
-	WARN_ON(in_interrupt());
-	vfree(module_region);
-}
-
-void __weak module_arch_cleanup(struct module *mod)
-{
-}
-
-void __weak module_arch_freeing_init(struct module *mod)
-{
-}
-
-static void cfi_cleanup(struct module *mod);
-
-/* Free a module, remove from lists, etc. */
-static void free_module(struct module *mod)
-{
-	trace_module_free(mod);
-
-	mod_sysfs_teardown(mod);
-
-	/*
-	 * We leave it in list to prevent duplicate loads, but make sure
-	 * that noone uses it while it's being deconstructed.
-	 */
-	mutex_lock(&module_mutex);
-	mod->state = MODULE_STATE_UNFORMED;
-	mutex_unlock(&module_mutex);
-
-	/* Remove dynamic debug info */
-	ddebug_remove_module(mod->name);
-
-	/* Arch-specific cleanup. */
-	module_arch_cleanup(mod);
-
-	/* Module unload stuff */
-	module_unload_free(mod);
-
-	/* Free any allocated parameters. */
-	destroy_params(mod->kp, mod->num_kp);
-
-	if (is_livepatch_module(mod))
-		free_module_elf(mod);
-
-	/* Now we can delete it from the lists */
-	mutex_lock(&module_mutex);
-	/* Unlink carefully: kallsyms could be walking list. */
-	list_del_rcu(&mod->list);
-	mod_tree_remove(mod);
-	/* Remove this module from bug list, this uses list_del_rcu */
-	module_bug_cleanup(mod);
-	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
-	synchronize_rcu();
-	mutex_unlock(&module_mutex);
-
-	/* Clean up CFI for the module. */
-	cfi_cleanup(mod);
-
-	/* This may be empty, but that's OK */
-	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
-	kfree(mod->args);
-	percpu_modfree(mod);
-
-	/* Free lock-classes; relies on the preceding sync_rcu(). */
-	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
-	/* Finally, free the core (containing the module structure) */
-	module_memfree(mod->core_layout.base);
-}
-
-void *__symbol_get(const char *symbol)
-{
-	struct find_symbol_arg fsa = {
-		.name	= symbol,
-		.gplok	= true,
-		.warn	= true,
-	};
-
-	preempt_disable();
-	if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
-		preempt_enable();
-		return NULL;
-	}
-	preempt_enable();
-	return (void *)kernel_symbol_value(fsa.sym);
-}
-EXPORT_SYMBOL_GPL(__symbol_get);
-
-/*
- * Ensure that an exported symbol [global namespace] does not already exist
- * in the kernel or in some other module's exported symbol table.
- *
- * You must hold the module_mutex.
- */
-static int verify_exported_symbols(struct module *mod)
-{
-	unsigned int i;
-	const struct kernel_symbol *s;
-	struct {
-		const struct kernel_symbol *sym;
-		unsigned int num;
-	} arr[] = {
-		{ mod->syms, mod->num_syms },
-		{ mod->gpl_syms, mod->num_gpl_syms },
-	};
-
-	for (i = 0; i < ARRAY_SIZE(arr); i++) {
-		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
-			struct find_symbol_arg fsa = {
-				.name	= kernel_symbol_name(s),
-				.gplok	= true,
-			};
-			if (find_symbol(&fsa)) {
-				pr_err("%s: exports duplicate symbol %s"
-				       " (owned by %s)\n",
-				       mod->name, kernel_symbol_name(s),
-				       module_name(fsa.owner));
-				return -ENOEXEC;
-			}
-		}
-	}
-	return 0;
-}
-
-static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
-{
-	/*
-	 * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
-	 * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
-	 * i386 has a similar problem but may not deserve a fix.
-	 *
-	 * If we ever have to ignore many symbols, consider refactoring the code to
-	 * only warn if referenced by a relocation.
-	 */
-	if (emachine == EM_386 || emachine == EM_X86_64)
-		return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
-	return false;
-}
-
-/* Change all symbols so that st_value encodes the pointer directly. */
-static int simplify_symbols(struct module *mod, const struct load_info *info)
-{
-	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-	Elf_Sym *sym = (void *)symsec->sh_addr;
-	unsigned long secbase;
-	unsigned int i;
-	int ret = 0;
-	const struct kernel_symbol *ksym;
-
-	for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
-		const char *name = info->strtab + sym[i].st_name;
-
-		switch (sym[i].st_shndx) {
-		case SHN_COMMON:
-			/* Ignore common symbols */
-			if (!strncmp(name, "__gnu_lto", 9))
-				break;
-
-			/*
-			 * We compiled with -fno-common.  These are not
-			 * supposed to happen.
-			 */
-			pr_debug("Common symbol: %s\n", name);
-			pr_warn("%s: please compile with -fno-common\n",
-			       mod->name);
-			ret = -ENOEXEC;
-			break;
-
-		case SHN_ABS:
-			/* Don't need to do anything */
-			pr_debug("Absolute symbol: 0x%08lx\n",
-			       (long)sym[i].st_value);
-			break;
-
-		case SHN_LIVEPATCH:
-			/* Livepatch symbols are resolved by livepatch */
-			break;
-
-		case SHN_UNDEF:
-			ksym = resolve_symbol_wait(mod, info, name);
-			/* Ok if resolved.  */
-			if (ksym && !IS_ERR(ksym)) {
-				sym[i].st_value = kernel_symbol_value(ksym);
-				break;
-			}
-
-			/* Ok if weak or ignored.  */
-			if (!ksym &&
-			    (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
-			     ignore_undef_symbol(info->hdr->e_machine, name)))
-				break;
-
-			ret = PTR_ERR(ksym) ?: -ENOENT;
-			pr_warn("%s: Unknown symbol %s (err %d)\n",
-				mod->name, name, ret);
-			break;
-
-		default:
-			/* Divert to percpu allocation if a percpu var. */
-			if (sym[i].st_shndx == info->index.pcpu)
-				secbase = (unsigned long)mod_percpu(mod);
-			else
-				secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
-			sym[i].st_value += secbase;
-			break;
-		}
-	}
-
-	return ret;
-}
-
-static int apply_relocations(struct module *mod, const struct load_info *info)
-{
-	unsigned int i;
-	int err = 0;
-
-	/* Now do relocations. */
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		unsigned int infosec = info->sechdrs[i].sh_info;
-
-		/* Not a valid relocation section? */
-		if (infosec >= info->hdr->e_shnum)
-			continue;
-
-		/* Don't bother with non-allocated sections */
-		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
-			continue;
-
-		if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
-			err = klp_apply_section_relocs(mod, info->sechdrs,
-						       info->secstrings,
-						       info->strtab,
-						       info->index.sym, i,
-						       NULL);
-		else if (info->sechdrs[i].sh_type == SHT_REL)
-			err = apply_relocate(info->sechdrs, info->strtab,
-					     info->index.sym, i, mod);
-		else if (info->sechdrs[i].sh_type == SHT_RELA)
-			err = apply_relocate_add(info->sechdrs, info->strtab,
-						 info->index.sym, i, mod);
-		if (err < 0)
-			break;
-	}
-	return err;
-}
-
-/* Additional bytes needed by arch in front of individual sections */
-unsigned int __weak arch_mod_section_prepend(struct module *mod,
-					     unsigned int section)
-{
-	/* default implementation just returns zero */
-	return 0;
-}
-
-/* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
-		       Elf_Shdr *sechdr, unsigned int section)
-{
-	long ret;
-
-	*size += arch_mod_section_prepend(mod, section);
-	ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
-	*size = ret + sechdr->sh_size;
-	return ret;
-}
-
-static bool module_init_layout_section(const char *sname)
-{
-#ifndef CONFIG_MODULE_UNLOAD
-	if (module_exit_section(sname))
-		return true;
-#endif
-	return module_init_section(sname);
-}
-
-/*
- * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- * might -- code, read-only data, read-write data, small data.  Tally
- * sizes, and place the offsets into sh_entsize fields: high bit means it
- * belongs in init.
- */
-static void layout_sections(struct module *mod, struct load_info *info)
-{
-	static unsigned long const masks[][2] = {
-		/*
-		 * NOTE: all executable code must be the first section
-		 * in this array; otherwise modify the text_size
-		 * finder in the two loops below
-		 */
-		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
-		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
-		{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
-		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
-		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
-	};
-	unsigned int m, i;
-
-	for (i = 0; i < info->hdr->e_shnum; i++)
-		info->sechdrs[i].sh_entsize = ~0UL;
-
-	pr_debug("Core section allocation order:\n");
-	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
-		for (i = 0; i < info->hdr->e_shnum; ++i) {
-			Elf_Shdr *s = &info->sechdrs[i];
-			const char *sname = info->secstrings + s->sh_name;
-
-			if ((s->sh_flags & masks[m][0]) != masks[m][0]
-			    || (s->sh_flags & masks[m][1])
-			    || s->sh_entsize != ~0UL
-			    || module_init_layout_section(sname))
-				continue;
-			s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
-			pr_debug("\t%s\n", sname);
-		}
-		switch (m) {
-		case 0: /* executable */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
-			mod->core_layout.text_size = mod->core_layout.size;
-			break;
-		case 1: /* RO: text and ro-data */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
-			mod->core_layout.ro_size = mod->core_layout.size;
-			break;
-		case 2: /* RO after init */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
-			mod->core_layout.ro_after_init_size = mod->core_layout.size;
-			break;
-		case 4: /* whole core */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
-			break;
-		}
-	}
-
-	pr_debug("Init section allocation order:\n");
-	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
-		for (i = 0; i < info->hdr->e_shnum; ++i) {
-			Elf_Shdr *s = &info->sechdrs[i];
-			const char *sname = info->secstrings + s->sh_name;
-
-			if ((s->sh_flags & masks[m][0]) != masks[m][0]
-			    || (s->sh_flags & masks[m][1])
-			    || s->sh_entsize != ~0UL
-			    || !module_init_layout_section(sname))
-				continue;
-			s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
-					 | INIT_OFFSET_MASK);
-			pr_debug("\t%s\n", sname);
-		}
-		switch (m) {
-		case 0: /* executable */
-			mod->init_layout.size = debug_align(mod->init_layout.size);
-			mod->init_layout.text_size = mod->init_layout.size;
-			break;
-		case 1: /* RO: text and ro-data */
-			mod->init_layout.size = debug_align(mod->init_layout.size);
-			mod->init_layout.ro_size = mod->init_layout.size;
-			break;
-		case 2:
-			/*
-			 * RO after init doesn't apply to init_layout (only
-			 * core_layout), so it just takes the value of ro_size.
-			 */
-			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
-			break;
-		case 4: /* whole init */
-			mod->init_layout.size = debug_align(mod->init_layout.size);
-			break;
-		}
-	}
-}
-
-static void set_license(struct module *mod, const char *license)
-{
-	if (!license)
-		license = "unspecified";
-
-	if (!license_is_gpl_compatible(license)) {
-		if (!test_taint(TAINT_PROPRIETARY_MODULE))
-			pr_warn("%s: module license '%s' taints kernel.\n",
-				mod->name, license);
-		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
-				 LOCKDEP_NOW_UNRELIABLE);
-	}
-}
-
-/* Parse tag=value strings from .modinfo section */
-static char *next_string(char *string, unsigned long *secsize)
-{
-	/* Skip non-zero chars */
-	while (string[0]) {
-		string++;
-		if ((*secsize)-- <= 1)
-			return NULL;
-	}
-
-	/* Skip any zero padding. */
-	while (!string[0]) {
-		string++;
-		if ((*secsize)-- <= 1)
-			return NULL;
-	}
-	return string;
-}
-
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
-			      char *prev)
-{
-	char *p;
-	unsigned int taglen = strlen(tag);
-	Elf_Shdr *infosec = &info->sechdrs[info->index.info];
-	unsigned long size = infosec->sh_size;
-
-	/*
-	 * get_modinfo() calls made before rewrite_section_headers()
-	 * must use sh_offset, as sh_addr isn't set!
-	 */
-	char *modinfo = (char *)info->hdr + infosec->sh_offset;
-
-	if (prev) {
-		size -= prev - modinfo;
-		modinfo = next_string(prev, &size);
-	}
-
-	for (p = modinfo; p; p = next_string(p, &size)) {
-		if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
-			return p + taglen + 1;
-	}
-	return NULL;
-}
-
-static char *get_modinfo(const struct load_info *info, const char *tag)
-{
-	return get_next_modinfo(info, tag, NULL);
-}
-
-static void setup_modinfo(struct module *mod, struct load_info *info)
-{
-	struct module_attribute *attr;
-	int i;
-
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
-		if (attr->setup)
-			attr->setup(mod, get_modinfo(info, attr->attr.name));
-	}
-}
-
-static void free_modinfo(struct module *mod)
-{
-	struct module_attribute *attr;
-	int i;
-
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
-		if (attr->free)
-			attr->free(mod);
-	}
-}
-
-#ifdef CONFIG_KALLSYMS
-
-/* Lookup exported symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_exported_symbol(const char *name,
-							  const struct kernel_symbol *start,
-							  const struct kernel_symbol *stop)
-{
-	return bsearch(name, start, stop - start,
-			sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
-		       const struct module *mod)
-{
-	const struct kernel_symbol *ks;
-	if (!mod)
-		ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
-	else
-		ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
-
-	return ks != NULL && kernel_symbol_value(ks) == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
-	const Elf_Shdr *sechdrs = info->sechdrs;
-
-	if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
-		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
-			return 'v';
-		else
-			return 'w';
-	}
-	if (sym->st_shndx == SHN_UNDEF)
-		return 'U';
-	if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
-		return 'a';
-	if (sym->st_shndx >= SHN_LORESERVE)
-		return '?';
-	if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
-		return 't';
-	if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
-	    && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
-		if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
-			return 'r';
-		else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
-			return 'g';
-		else
-			return 'd';
-	}
-	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
-		if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
-			return 's';
-		else
-			return 'b';
-	}
-	if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
-		      ".debug")) {
-		return 'n';
-	}
-	return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
-			unsigned int shnum, unsigned int pcpundx)
-{
-	const Elf_Shdr *sec;
-
-	if (src->st_shndx == SHN_UNDEF
-	    || src->st_shndx >= shnum
-	    || !src->st_name)
-		return false;
-
-#ifdef CONFIG_KALLSYMS_ALL
-	if (src->st_shndx == pcpundx)
-		return true;
-#endif
-
-	sec = sechdrs + src->st_shndx;
-	if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
-	    || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
-	    || (sec->sh_entsize & INIT_OFFSET_MASK))
-		return false;
-
-	return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep.  This is simple, but has the effect of making multiple
- * copies of duplicates.  We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
-	Elf_Shdr *symsect = info->sechdrs + info->index.sym;
-	Elf_Shdr *strsect = info->sechdrs + info->index.str;
-	const Elf_Sym *src;
-	unsigned int i, nsrc, ndst, strtab_size = 0;
-
-	/* Put symbol section at end of init part of module. */
-	symsect->sh_flags |= SHF_ALLOC;
-	symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
-					 info->index.sym) | INIT_OFFSET_MASK;
-	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
-	src = (void *)info->hdr + symsect->sh_offset;
-	nsrc = symsect->sh_size / sizeof(*src);
-
-	/* Compute total space required for the core symbols' strtab. */
-	for (ndst = i = 0; i < nsrc; i++) {
-		if (i == 0 || is_livepatch_module(mod) ||
-		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
-				   info->index.pcpu)) {
-			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
-			ndst++;
-		}
-	}
-
-	/* Append room for core symbols at end of core part. */
-	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
-	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
-	mod->core_layout.size += strtab_size;
-	info->core_typeoffs = mod->core_layout.size;
-	mod->core_layout.size += ndst * sizeof(char);
-	mod->core_layout.size = debug_align(mod->core_layout.size);
-
-	/* Put string table section at end of init part of module. */
-	strsect->sh_flags |= SHF_ALLOC;
-	strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
-					 info->index.str) | INIT_OFFSET_MASK;
-	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-
-	/* We'll tack temporary mod_kallsyms on the end. */
-	mod->init_layout.size = ALIGN(mod->init_layout.size,
-				      __alignof__(struct mod_kallsyms));
-	info->mod_kallsyms_init_off = mod->init_layout.size;
-	mod->init_layout.size += sizeof(struct mod_kallsyms);
-	info->init_typeoffs = mod->init_layout.size;
-	mod->init_layout.size += nsrc * sizeof(char);
-	mod->init_layout.size = debug_align(mod->init_layout.size);
-}
-
-/*
- * We use the full symtab and strtab which layout_symtab arranged to
- * be appended to the init section.  Later we switch to the cut-down
- * core-only ones.
- */
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-	unsigned int i, ndst;
-	const Elf_Sym *src;
-	Elf_Sym *dst;
-	char *s;
-	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
-	/* Set up to point into init section. */
-	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
-
-	mod->kallsyms->symtab = (void *)symsec->sh_addr;
-	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
-	/* Make sure we get permanent strtab: don't use info->strtab. */
-	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
-	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
-
-	/*
-	 * Now populate the cut down core kallsyms for after init
-	 * and set types up while we still have access to sections.
-	 */
-	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
-	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
-	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
-	src = mod->kallsyms->symtab;
-	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
-		mod->kallsyms->typetab[i] = elf_type(src + i, info);
-		if (i == 0 || is_livepatch_module(mod) ||
-		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
-				   info->index.pcpu)) {
-			mod->core_kallsyms.typetab[ndst] =
-			    mod->kallsyms->typetab[i];
-			dst[ndst] = src[i];
-			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
-			s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
-				     KSYM_NAME_LEN) + 1;
-		}
-	}
-	mod->core_kallsyms.num_symtab = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-	const Elf_Shdr *sechdr;
-	unsigned int i;
-
-	for (i = 0; i < info->hdr->e_shnum; i++) {
-		sechdr = &info->sechdrs[i];
-		if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
-		    !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
-					sechdr->sh_size))
-			break;
-	}
-}
-#else
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-}
-#endif
-
-static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
-{
-	if (!debug)
-		return;
-	ddebug_add_module(debug, num, mod->name);
-}
-
-static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
-{
-	if (debug)
-		ddebug_remove_module(mod->name);
-}
-
-void * __weak module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
-}
-
-bool __weak module_init_section(const char *name)
-{
-	return strstarts(name, ".init");
-}
-
-bool __weak module_exit_section(const char *name)
-{
-	return strstarts(name, ".exit");
-}
-
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
-				 const struct load_info *info)
-{
-	unsigned int i;
-
-	/* only scan the sections containing data */
-	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		/* Scan all writable sections that's not executable */
-		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
-		    !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
-		    (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
-			continue;
-
-		kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
-				   info->sechdrs[i].sh_size, GFP_KERNEL);
-	}
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
-					const struct load_info *info)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info, int flags)
-{
-	int err = -ENODATA;
-	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
-	const char *reason;
-	const void *mod = info->hdr;
-	bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
-				       MODULE_INIT_IGNORE_VERMAGIC);
-	/*
-	 * Do not allow mangled modules as a module with version information
-	 * removed is no longer the module that was signed.
-	 */
-	if (!mangled_module &&
-	    info->len > markerlen &&
-	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
-		/* We truncate the module to discard the signature */
-		info->len -= markerlen;
-		err = mod_verify_sig(mod, info);
-		if (!err) {
-			info->sig_ok = true;
-			return 0;
-		}
-	}
-
-	/*
-	 * We don't permit modules to be loaded into the trusted kernels
-	 * without a valid signature on them, but if we're not enforcing,
-	 * certain errors are non-fatal.
-	 */
-	switch (err) {
-	case -ENODATA:
-		reason = "unsigned module";
-		break;
-	case -ENOPKG:
-		reason = "module with unsupported crypto";
-		break;
-	case -ENOKEY:
-		reason = "module with unavailable key";
-		break;
-
-	default:
-		/*
-		 * All other errors are fatal, including lack of memory,
-		 * unparseable signatures, and signature check failures --
-		 * even if signatures aren't required.
-		 */
-		return err;
-	}
-
-	if (is_module_sig_enforced()) {
-		pr_notice("Loading of %s is rejected\n", reason);
-		return -EKEYREJECTED;
-	}
-
-	return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info, int flags)
-{
-	return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
-static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
-{
-#if defined(CONFIG_64BIT)
-	unsigned long long secend;
-#else
-	unsigned long secend;
-#endif
-
-	/*
-	 * Check for both overflow and offset/size being
-	 * too large.
-	 */
-	secend = shdr->sh_offset + shdr->sh_size;
-	if (secend < shdr->sh_offset || secend > info->len)
-		return -ENOEXEC;
-
-	return 0;
-}
-
-/*
- * Sanity checks against invalid binaries, wrong arch, weird elf version.
- *
- * Also do basic validity checks against section offsets and sizes, the
- * section name string table, and the indices used for it (sh_name).
- */
-static int elf_validity_check(struct load_info *info)
-{
-	unsigned int i;
-	Elf_Shdr *shdr, *strhdr;
-	int err;
-
-	if (info->len < sizeof(*(info->hdr))) {
-		pr_err("Invalid ELF header len %lu\n", info->len);
-		goto no_exec;
-	}
-
-	if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
-		pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
-		goto no_exec;
-	}
-	if (info->hdr->e_type != ET_REL) {
-		pr_err("Invalid ELF header type: %u != %u\n",
-		       info->hdr->e_type, ET_REL);
-		goto no_exec;
-	}
-	if (!elf_check_arch(info->hdr)) {
-		pr_err("Invalid architecture in ELF header: %u\n",
-		       info->hdr->e_machine);
-		goto no_exec;
-	}
-	if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
-		pr_err("Invalid ELF section header size\n");
-		goto no_exec;
-	}
-
-	/*
-	 * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
-	 * known and small. So e_shnum * sizeof(Elf_Shdr)
-	 * will not overflow unsigned long on any platform.
-	 */
-	if (info->hdr->e_shoff >= info->len
-	    || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
-		info->len - info->hdr->e_shoff)) {
-		pr_err("Invalid ELF section header overflow\n");
-		goto no_exec;
-	}
-
-	info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
-
-	/*
-	 * Verify if the section name table index is valid.
-	 */
-	if (info->hdr->e_shstrndx == SHN_UNDEF
-	    || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
-		pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
-		       info->hdr->e_shstrndx, info->hdr->e_shstrndx,
-		       info->hdr->e_shnum);
-		goto no_exec;
-	}
-
-	strhdr = &info->sechdrs[info->hdr->e_shstrndx];
-	err = validate_section_offset(info, strhdr);
-	if (err < 0) {
-		pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
-		return err;
-	}
-
-	/*
-	 * The section name table must be NUL-terminated, as required
-	 * by the spec. This makes strcmp and pr_* calls that access
-	 * strings in the section safe.
-	 */
-	info->secstrings = (void *)info->hdr + strhdr->sh_offset;
-	if (info->secstrings[strhdr->sh_size - 1] != '\0') {
-		pr_err("ELF Spec violation: section name table isn't null terminated\n");
-		goto no_exec;
-	}
-
-	/*
-	 * The code assumes that section 0 has a length of zero and
-	 * an addr of zero, so check for it.
-	 */
-	if (info->sechdrs[0].sh_type != SHT_NULL
-	    || info->sechdrs[0].sh_size != 0
-	    || info->sechdrs[0].sh_addr != 0) {
-		pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
-		       info->sechdrs[0].sh_type);
-		goto no_exec;
-	}
-
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		shdr = &info->sechdrs[i];
-		switch (shdr->sh_type) {
-		case SHT_NULL:
-		case SHT_NOBITS:
-			continue;
-		case SHT_SYMTAB:
-			if (shdr->sh_link == SHN_UNDEF
-			    || shdr->sh_link >= info->hdr->e_shnum) {
-				pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
-				       shdr->sh_link, shdr->sh_link,
-				       info->hdr->e_shnum);
-				goto no_exec;
-			}
-			fallthrough;
-		default:
-			err = validate_section_offset(info, shdr);
-			if (err < 0) {
-				pr_err("Invalid ELF section in module (section %u type %u)\n",
-					i, shdr->sh_type);
-				return err;
-			}
-
-			if (shdr->sh_flags & SHF_ALLOC) {
-				if (shdr->sh_name >= strhdr->sh_size) {
-					pr_err("Invalid ELF section name in module (section %u type %u)\n",
-					       i, shdr->sh_type);
-					return -ENOEXEC;
-				}
-			}
-			break;
-		}
-	}
-
-	return 0;
-
-no_exec:
-	return -ENOEXEC;
-}
-
-#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
-
-static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
-{
-	do {
-		unsigned long n = min(len, COPY_CHUNK_SIZE);
-
-		if (copy_from_user(dst, usrc, n) != 0)
-			return -EFAULT;
-		cond_resched();
-		dst += n;
-		usrc += n;
-		len -= n;
-	} while (len);
-	return 0;
-}
-
-#ifdef CONFIG_LIVEPATCH
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
-	if (get_modinfo(info, "livepatch")) {
-		mod->klp = true;
-		add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
-		pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
-			       mod->name);
-	}
-
-	return 0;
-}
-#else /* !CONFIG_LIVEPATCH */
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
-	if (get_modinfo(info, "livepatch")) {
-		pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
-		       mod->name);
-		return -ENOEXEC;
-	}
-
-	return 0;
-}
-#endif /* CONFIG_LIVEPATCH */
-
-static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
-{
-	if (retpoline_module_ok(get_modinfo(info, "retpoline")))
-		return;
-
-	pr_warn("%s: loading module not compiled with retpoline compiler.\n",
-		mod->name);
-}
-
-/* Sets info->hdr and info->len. */
-static int copy_module_from_user(const void __user *umod, unsigned long len,
-				  struct load_info *info)
-{
-	int err;
-
-	info->len = len;
-	if (info->len < sizeof(*(info->hdr)))
-		return -ENOEXEC;
-
-	err = security_kernel_load_data(LOADING_MODULE, true);
-	if (err)
-		return err;
-
-	/* Suck in entire file: we'll want most of it. */
-	info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
-	if (!info->hdr)
-		return -ENOMEM;
-
-	if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
-		err = -EFAULT;
-		goto out;
-	}
-
-	err = security_kernel_post_load_data((char *)info->hdr, info->len,
-					     LOADING_MODULE, "init_module");
-out:
-	if (err)
-		vfree(info->hdr);
-
-	return err;
-}
-
-static void free_copy(struct load_info *info, int flags)
-{
-	if (flags & MODULE_INIT_COMPRESSED_FILE)
-		module_decompress_cleanup(info);
-	else
-		vfree(info->hdr);
-}
-
-static int rewrite_section_headers(struct load_info *info, int flags)
-{
-	unsigned int i;
-
-	/* This should always be true, but let's be sure. */
-	info->sechdrs[0].sh_addr = 0;
-
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		Elf_Shdr *shdr = &info->sechdrs[i];
-
-		/*
-		 * Mark all sections sh_addr with their address in the
-		 * temporary image.
-		 */
-		shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
-
-	}
-
-	/* Track but don't keep modinfo and version sections. */
-	info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
-	info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
-	return 0;
-}
-
-/*
- * Set up our basic convenience variables (pointers to section headers,
- * search for module section index etc), and do some basic section
- * verification.
- *
- * Set info->mod to the temporary copy of the module in info->hdr. The final one
- * will be allocated in move_module().
- */
-static int setup_load_info(struct load_info *info, int flags)
-{
-	unsigned int i;
-
-	/* Try to find a name early so we can log errors with a module name */
-	info->index.info = find_sec(info, ".modinfo");
-	if (info->index.info)
-		info->name = get_modinfo(info, "name");
-
-	/* Find internal symbols and strings. */
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
-			info->index.sym = i;
-			info->index.str = info->sechdrs[i].sh_link;
-			info->strtab = (char *)info->hdr
-				+ info->sechdrs[info->index.str].sh_offset;
-			break;
-		}
-	}
-
-	if (info->index.sym == 0) {
-		pr_warn("%s: module has no symbols (stripped?)\n",
-			info->name ?: "(missing .modinfo section or name field)");
-		return -ENOEXEC;
-	}
-
-	info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
-	if (!info->index.mod) {
-		pr_warn("%s: No module found in object\n",
-			info->name ?: "(missing .modinfo section or name field)");
-		return -ENOEXEC;
-	}
-	/* This is temporary: point mod into copy of data. */
-	info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
-
-	/*
-	 * If we didn't load the .modinfo 'name' field earlier, fall back to
-	 * on-disk struct mod 'name' field.
-	 */
-	if (!info->name)
-		info->name = info->mod->name;
-
-	if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
-		info->index.vers = 0; /* Pretend no __versions section! */
-	else
-		info->index.vers = find_sec(info, "__versions");
-
-	info->index.pcpu = find_pcpusec(info);
-
-	return 0;
-}
-
-static int check_modinfo(struct module *mod, struct load_info *info, int flags)
-{
-	const char *modmagic = get_modinfo(info, "vermagic");
-	int err;
-
-	if (flags & MODULE_INIT_IGNORE_VERMAGIC)
-		modmagic = NULL;
-
-	/* This is allowed: modprobe --force will invalidate it. */
-	if (!modmagic) {
-		err = try_to_force_load(mod, "bad vermagic");
-		if (err)
-			return err;
-	} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
-		pr_err("%s: version magic '%s' should be '%s'\n",
-		       info->name, modmagic, vermagic);
-		return -ENOEXEC;
-	}
-
-	if (!get_modinfo(info, "intree")) {
-		if (!test_taint(TAINT_OOT_MODULE))
-			pr_warn("%s: loading out-of-tree module taints kernel.\n",
-				mod->name);
-		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
-	}
-
-	check_modinfo_retpoline(mod, info);
-
-	if (get_modinfo(info, "staging")) {
-		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
-		pr_warn("%s: module is from the staging directory, the quality "
-			"is unknown, you have been warned.\n", mod->name);
-	}
-
-	err = check_modinfo_livepatch(mod, info);
-	if (err)
-		return err;
-
-	/* Set up license info based on the info section */
-	set_license(mod, get_modinfo(info, "license"));
-
-	return 0;
-}
-
-static int find_module_sections(struct module *mod, struct load_info *info)
-{
-	mod->kp = section_objs(info, "__param",
-			       sizeof(*mod->kp), &mod->num_kp);
-	mod->syms = section_objs(info, "__ksymtab",
-				 sizeof(*mod->syms), &mod->num_syms);
-	mod->crcs = section_addr(info, "__kcrctab");
-	mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
-				     sizeof(*mod->gpl_syms),
-				     &mod->num_gpl_syms);
-	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
-
-#ifdef CONFIG_CONSTRUCTORS
-	mod->ctors = section_objs(info, ".ctors",
-				  sizeof(*mod->ctors), &mod->num_ctors);
-	if (!mod->ctors)
-		mod->ctors = section_objs(info, ".init_array",
-				sizeof(*mod->ctors), &mod->num_ctors);
-	else if (find_sec(info, ".init_array")) {
-		/*
-		 * This shouldn't happen with same compiler and binutils
-		 * building all parts of the module.
-		 */
-		pr_warn("%s: has both .ctors and .init_array.\n",
-		       mod->name);
-		return -EINVAL;
-	}
-#endif
-
-	mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
-						&mod->noinstr_text_size);
-
-#ifdef CONFIG_TRACEPOINTS
-	mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
-					     sizeof(*mod->tracepoints_ptrs),
-					     &mod->num_tracepoints);
-#endif
-#ifdef CONFIG_TREE_SRCU
-	mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
-					     sizeof(*mod->srcu_struct_ptrs),
-					     &mod->num_srcu_structs);
-#endif
-#ifdef CONFIG_BPF_EVENTS
-	mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
-					   sizeof(*mod->bpf_raw_events),
-					   &mod->num_bpf_raw_events);
-#endif
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-	mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
-#endif
-#ifdef CONFIG_JUMP_LABEL
-	mod->jump_entries = section_objs(info, "__jump_table",
-					sizeof(*mod->jump_entries),
-					&mod->num_jump_entries);
-#endif
-#ifdef CONFIG_EVENT_TRACING
-	mod->trace_events = section_objs(info, "_ftrace_events",
-					 sizeof(*mod->trace_events),
-					 &mod->num_trace_events);
-	mod->trace_evals = section_objs(info, "_ftrace_eval_map",
-					sizeof(*mod->trace_evals),
-					&mod->num_trace_evals);
-#endif
-#ifdef CONFIG_TRACING
-	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
-					 sizeof(*mod->trace_bprintk_fmt_start),
-					 &mod->num_trace_bprintk_fmt);
-#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
-	/* sechdrs[0].sh_size is always zero */
-	mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
-					     sizeof(*mod->ftrace_callsites),
-					     &mod->num_ftrace_callsites);
-#endif
-#ifdef CONFIG_FUNCTION_ERROR_INJECTION
-	mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
-					    sizeof(*mod->ei_funcs),
-					    &mod->num_ei_funcs);
-#endif
-#ifdef CONFIG_KPROBES
-	mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
-						&mod->kprobes_text_size);
-	mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
-						sizeof(unsigned long),
-						&mod->num_kprobe_blacklist);
-#endif
-#ifdef CONFIG_PRINTK_INDEX
-	mod->printk_index_start = section_objs(info, ".printk_index",
-					       sizeof(*mod->printk_index_start),
-					       &mod->printk_index_size);
-#endif
-#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
-	mod->static_call_sites = section_objs(info, ".static_call_sites",
-					      sizeof(*mod->static_call_sites),
-					      &mod->num_static_call_sites);
-#endif
-	mod->extable = section_objs(info, "__ex_table",
-				    sizeof(*mod->extable), &mod->num_exentries);
-
-	if (section_addr(info, "__obsparm"))
-		pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
-
-	info->debug = section_objs(info, "__dyndbg",
-				   sizeof(*info->debug), &info->num_debug);
-
-	return 0;
-}
-
-static int move_module(struct module *mod, struct load_info *info)
-{
-	int i;
-	void *ptr;
-
-	/* Do the allocs. */
-	ptr = module_alloc(mod->core_layout.size);
-	/*
-	 * The pointer to this block is stored in the module structure
-	 * which is inside the block. Just mark it as not being a
-	 * leak.
-	 */
-	kmemleak_not_leak(ptr);
-	if (!ptr)
-		return -ENOMEM;
-
-	memset(ptr, 0, mod->core_layout.size);
-	mod->core_layout.base = ptr;
-
-	if (mod->init_layout.size) {
-		ptr = module_alloc(mod->init_layout.size);
-		/*
-		 * The pointer to this block is stored in the module structure
-		 * which is inside the block. This block doesn't need to be
-		 * scanned as it contains data and code that will be freed
-		 * after the module is initialized.
-		 */
-		kmemleak_ignore(ptr);
-		if (!ptr) {
-			module_memfree(mod->core_layout.base);
-			return -ENOMEM;
-		}
-		memset(ptr, 0, mod->init_layout.size);
-		mod->init_layout.base = ptr;
-	} else
-		mod->init_layout.base = NULL;
-
-	/* Transfer each section which specifies SHF_ALLOC */
-	pr_debug("final section addresses:\n");
-	for (i = 0; i < info->hdr->e_shnum; i++) {
-		void *dest;
-		Elf_Shdr *shdr = &info->sechdrs[i];
-
-		if (!(shdr->sh_flags & SHF_ALLOC))
-			continue;
-
-		if (shdr->sh_entsize & INIT_OFFSET_MASK)
-			dest = mod->init_layout.base
-				+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
-		else
-			dest = mod->core_layout.base + shdr->sh_entsize;
-
-		if (shdr->sh_type != SHT_NOBITS)
-			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
-		/* Update sh_addr to point to copy in image. */
-		shdr->sh_addr = (unsigned long)dest;
-		pr_debug("\t0x%lx %s\n",
-			 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
-	}
-
-	return 0;
-}
-
-static int check_module_license_and_versions(struct module *mod)
-{
-	int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
-
-	/*
-	 * ndiswrapper is under GPL by itself, but loads proprietary modules.
-	 * Don't use add_taint_module(), as it would prevent ndiswrapper from
-	 * using GPL-only symbols it needs.
-	 */
-	if (strcmp(mod->name, "ndiswrapper") == 0)
-		add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
-
-	/* driverloader was caught wrongly pretending to be under GPL */
-	if (strcmp(mod->name, "driverloader") == 0)
-		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
-				 LOCKDEP_NOW_UNRELIABLE);
-
-	/* lve claims to be GPL but upstream won't provide source */
-	if (strcmp(mod->name, "lve") == 0)
-		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
-				 LOCKDEP_NOW_UNRELIABLE);
-
-	if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
-		pr_warn("%s: module license taints kernel.\n", mod->name);
-
-#ifdef CONFIG_MODVERSIONS
-	if ((mod->num_syms && !mod->crcs) ||
-	    (mod->num_gpl_syms && !mod->gpl_crcs)) {
-		return try_to_force_load(mod,
-					 "no versions for exported symbols");
-	}
-#endif
-	return 0;
-}
-
-static void flush_module_icache(const struct module *mod)
-{
-	/*
-	 * Flush the instruction cache, since we've played with text.
-	 * Do it before processing of module parameters, so the module
-	 * can provide parameter accessor functions of its own.
-	 */
-	if (mod->init_layout.base)
-		flush_icache_range((unsigned long)mod->init_layout.base,
-				   (unsigned long)mod->init_layout.base
-				   + mod->init_layout.size);
-	flush_icache_range((unsigned long)mod->core_layout.base,
-			   (unsigned long)mod->core_layout.base + mod->core_layout.size);
-}
-
-int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
-				     Elf_Shdr *sechdrs,
-				     char *secstrings,
-				     struct module *mod)
-{
-	return 0;
-}
-
-/* module_blacklist is a comma-separated list of module names */
-static char *module_blacklist;
-static bool blacklisted(const char *module_name)
-{
-	const char *p;
-	size_t len;
-
-	if (!module_blacklist)
-		return false;
-
-	for (p = module_blacklist; *p; p += len) {
-		len = strcspn(p, ",");
-		if (strlen(module_name) == len && !memcmp(module_name, p, len))
-			return true;
-		if (p[len] == ',')
-			len++;
-	}
-	return false;
-}
-core_param(module_blacklist, module_blacklist, charp, 0400);
-
-static struct module *layout_and_allocate(struct load_info *info, int flags)
-{
-	struct module *mod;
-	unsigned int ndx;
-	int err;
-
-	err = check_modinfo(info->mod, info, flags);
-	if (err)
-		return ERR_PTR(err);
-
-	/* Allow arches to frob section contents and sizes.  */
-	err = module_frob_arch_sections(info->hdr, info->sechdrs,
-					info->secstrings, info->mod);
-	if (err < 0)
-		return ERR_PTR(err);
-
-	err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
-					  info->secstrings, info->mod);
-	if (err < 0)
-		return ERR_PTR(err);
-
-	/* We will do a special allocation for per-cpu sections later. */
-	info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
-	/*
-	 * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
-	 * layout_sections() can put it in the right place.
-	 * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
-	 */
-	ndx = find_sec(info, ".data..ro_after_init");
-	if (ndx)
-		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-	/*
-	 * Mark the __jump_table section as ro_after_init as well: these data
-	 * structures are never modified, with the exception of entries that
-	 * refer to code in the __init section, which are annotated as such
-	 * at module load time.
-	 */
-	ndx = find_sec(info, "__jump_table");
-	if (ndx)
-		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-
-	/*
-	 * Determine total sizes, and put offsets in sh_entsize.  For now
-	 * this is done generically; there doesn't appear to be any
-	 * special cases for the architectures.
-	 */
-	layout_sections(info->mod, info);
-	layout_symtab(info->mod, info);
-
-	/* Allocate and move to the final place */
-	err = move_module(info->mod, info);
-	if (err)
-		return ERR_PTR(err);
-
-	/* Module has been copied to its final place now: return it. */
-	mod = (void *)info->sechdrs[info->index.mod].sh_addr;
-	kmemleak_load_module(mod, info);
-	return mod;
-}
-
-/* mod is no longer valid after this! */
-static void module_deallocate(struct module *mod, struct load_info *info)
-{
-	percpu_modfree(mod);
-	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
-	module_memfree(mod->core_layout.base);
-}
-
-int __weak module_finalize(const Elf_Ehdr *hdr,
-			   const Elf_Shdr *sechdrs,
-			   struct module *me)
-{
-	return 0;
-}
-
-static int post_relocation(struct module *mod, const struct load_info *info)
-{
-	/* Sort exception table now relocations are done. */
-	sort_extable(mod->extable, mod->extable + mod->num_exentries);
-
-	/* Copy relocated percpu area over. */
-	percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
-		       info->sechdrs[info->index.pcpu].sh_size);
-
-	/* Setup kallsyms-specific fields. */
-	add_kallsyms(mod, info);
-
-	/* Arch-specific module finalizing. */
-	return module_finalize(info->hdr, info->sechdrs, mod);
-}
-
-/* Is this module of this name done loading?  No locks held. */
-static bool finished_loading(const char *name)
-{
-	struct module *mod;
-	bool ret;
-
-	/*
-	 * The module_mutex should not be a heavily contended lock;
-	 * if we get the occasional sleep here, we'll go an extra iteration
-	 * in the wait_event_interruptible(), which is harmless.
-	 */
-	sched_annotate_sleep();
-	mutex_lock(&module_mutex);
-	mod = find_module_all(name, strlen(name), true);
-	ret = !mod || mod->state == MODULE_STATE_LIVE;
-	mutex_unlock(&module_mutex);
-
-	return ret;
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
-	unsigned long i;
-
-	for (i = 0; i < mod->num_ctors; i++)
-		mod->ctors[i]();
-#endif
-}
-
-/* For freeing module_init on success, in case kallsyms traversing */
-struct mod_initfree {
-	struct llist_node node;
-	void *module_init;
-};
-
-static void do_free_init(struct work_struct *w)
-{
-	struct llist_node *pos, *n, *list;
-	struct mod_initfree *initfree;
-
-	list = llist_del_all(&init_free_list);
-
-	synchronize_rcu();
-
-	llist_for_each_safe(pos, n, list) {
-		initfree = container_of(pos, struct mod_initfree, node);
-		module_memfree(initfree->module_init);
-		kfree(initfree);
-	}
-}
-
-/*
- * This is where the real work happens.
- *
- * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
- * helper command 'lx-symbols'.
- */
-static noinline int do_init_module(struct module *mod)
-{
-	int ret = 0;
-	struct mod_initfree *freeinit;
-
-	freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
-	if (!freeinit) {
-		ret = -ENOMEM;
-		goto fail;
-	}
-	freeinit->module_init = mod->init_layout.base;
-
-	do_mod_ctors(mod);
-	/* Start the module */
-	if (mod->init != NULL)
-		ret = do_one_initcall(mod->init);
-	if (ret < 0) {
-		goto fail_free_freeinit;
-	}
-	if (ret > 0) {
-		pr_warn("%s: '%s'->init suspiciously returned %d, it should "
-			"follow 0/-E convention\n"
-			"%s: loading module anyway...\n",
-			__func__, mod->name, ret, __func__);
-		dump_stack();
-	}
-
-	/* Now it's a first class citizen! */
-	mod->state = MODULE_STATE_LIVE;
-	blocking_notifier_call_chain(&module_notify_list,
-				     MODULE_STATE_LIVE, mod);
-
-	/* Delay uevent until module has finished its init routine */
-	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
-
-	/*
-	 * We need to finish all async code before the module init sequence
-	 * is done. This has potential to deadlock if synchronous module
-	 * loading is requested from async (which is not allowed!).
-	 *
-	 * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
-	 * request_module() from async workers") for more details.
-	 */
-	if (!mod->async_probe_requested)
-		async_synchronize_full();
-
-	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
-			mod->init_layout.size);
-	mutex_lock(&module_mutex);
-	/* Drop initial reference. */
-	module_put(mod);
-	trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
-	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
-	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
-#endif
-	module_enable_ro(mod, true);
-	mod_tree_remove_init(mod);
-	module_arch_freeing_init(mod);
-	mod->init_layout.base = NULL;
-	mod->init_layout.size = 0;
-	mod->init_layout.ro_size = 0;
-	mod->init_layout.ro_after_init_size = 0;
-	mod->init_layout.text_size = 0;
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-	/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
-	mod->btf_data = NULL;
-#endif
-	/*
-	 * We want to free module_init, but be aware that kallsyms may be
-	 * walking this with preempt disabled.  In all the failure paths, we
-	 * call synchronize_rcu(), but we don't want to slow down the success
-	 * path. module_memfree() cannot be called in an interrupt, so do the
-	 * work and call synchronize_rcu() in a work queue.
-	 *
-	 * Note that module_alloc() on most architectures creates W+X page
-	 * mappings which won't be cleaned up until do_free_init() runs.  Any
-	 * code such as mark_rodata_ro() which depends on those mappings to
-	 * be cleaned up needs to sync with the queued work - ie
-	 * rcu_barrier()
-	 */
-	if (llist_add(&freeinit->node, &init_free_list))
-		schedule_work(&init_free_wq);
-
-	mutex_unlock(&module_mutex);
-	wake_up_all(&module_wq);
-
-	return 0;
-
-fail_free_freeinit:
-	kfree(freeinit);
-fail:
-	/* Try to protect us from buggy refcounters. */
-	mod->state = MODULE_STATE_GOING;
-	synchronize_rcu();
-	module_put(mod);
-	blocking_notifier_call_chain(&module_notify_list,
-				     MODULE_STATE_GOING, mod);
-	klp_module_going(mod);
-	ftrace_release_mod(mod);
-	free_module(mod);
-	wake_up_all(&module_wq);
-	return ret;
-}
-
-static int may_init_module(void)
-{
-	if (!capable(CAP_SYS_MODULE) || modules_disabled)
-		return -EPERM;
-
-	return 0;
-}
-
-/*
- * We try to place it in the list now to make sure it's unique before
- * we dedicate too many resources.  In particular, temporary percpu
- * memory exhaustion.
- */
-static int add_unformed_module(struct module *mod)
-{
-	int err;
-	struct module *old;
-
-	mod->state = MODULE_STATE_UNFORMED;
-
-again:
-	mutex_lock(&module_mutex);
-	old = find_module_all(mod->name, strlen(mod->name), true);
-	if (old != NULL) {
-		if (old->state != MODULE_STATE_LIVE) {
-			/* Wait in case it fails to load. */
-			mutex_unlock(&module_mutex);
-			err = wait_event_interruptible(module_wq,
-					       finished_loading(mod->name));
-			if (err)
-				goto out_unlocked;
-			goto again;
-		}
-		err = -EEXIST;
-		goto out;
-	}
-	mod_update_bounds(mod);
-	list_add_rcu(&mod->list, &modules);
-	mod_tree_insert(mod);
-	err = 0;
-
-out:
-	mutex_unlock(&module_mutex);
-out_unlocked:
-	return err;
-}
-
-static int complete_formation(struct module *mod, struct load_info *info)
-{
-	int err;
-
-	mutex_lock(&module_mutex);
-
-	/* Find duplicate symbols (must be called under lock). */
-	err = verify_exported_symbols(mod);
-	if (err < 0)
-		goto out;
-
-	/* This relies on module_mutex for list integrity. */
-	module_bug_finalize(info->hdr, info->sechdrs, mod);
-
-	module_enable_ro(mod, false);
-	module_enable_nx(mod);
-	module_enable_x(mod);
-
-	/*
-	 * Mark state as coming so strong_try_module_get() ignores us,
-	 * but kallsyms etc. can see us.
-	 */
-	mod->state = MODULE_STATE_COMING;
-	mutex_unlock(&module_mutex);
-
-	return 0;
-
-out:
-	mutex_unlock(&module_mutex);
-	return err;
-}
-
-static int prepare_coming_module(struct module *mod)
-{
-	int err;
-
-	ftrace_module_enable(mod);
-	err = klp_module_coming(mod);
-	if (err)
-		return err;
-
-	err = blocking_notifier_call_chain_robust(&module_notify_list,
-			MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
-	err = notifier_to_errno(err);
-	if (err)
-		klp_module_going(mod);
-
-	return err;
-}
-
-static int unknown_module_param_cb(char *param, char *val, const char *modname,
-				   void *arg)
-{
-	struct module *mod = arg;
-	int ret;
-
-	if (strcmp(param, "async_probe") == 0) {
-		mod->async_probe_requested = true;
-		return 0;
-	}
-
-	/* Check for magic 'dyndbg' arg */
-	ret = ddebug_dyndbg_module_param_cb(param, val, modname);
-	if (ret != 0)
-		pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
-	return 0;
-}
-
-static void cfi_init(struct module *mod);
-
-/*
- * Allocate and load the module: note that size of section 0 is always
- * zero, and we rely on this for optional sections.
- */
-static int load_module(struct load_info *info, const char __user *uargs,
-		       int flags)
-{
-	struct module *mod;
-	long err = 0;
-	char *after_dashes;
-
-	/*
-	 * Do the signature check (if any) first. All that
-	 * the signature check needs is info->len, it does
-	 * not need any of the section info. That can be
-	 * set up later. This will minimize the chances
-	 * of a corrupt module causing problems before
-	 * we even get to the signature check.
-	 *
-	 * The check will also adjust info->len by stripping
-	 * off the sig length at the end of the module, making
-	 * checks against info->len more correct.
-	 */
-	err = module_sig_check(info, flags);
-	if (err)
-		goto free_copy;
-
-	/*
-	 * Do basic sanity checks against the ELF header and
-	 * sections.
-	 */
-	err = elf_validity_check(info);
-	if (err)
-		goto free_copy;
-
-	/*
-	 * Everything checks out, so set up the section info
-	 * in the info structure.
-	 */
-	err = setup_load_info(info, flags);
-	if (err)
-		goto free_copy;
-
-	/*
-	 * Now that we know we have the correct module name, check
-	 * if it's blacklisted.
-	 */
-	if (blacklisted(info->name)) {
-		err = -EPERM;
-		pr_err("Module %s is blacklisted\n", info->name);
-		goto free_copy;
-	}
-
-	err = rewrite_section_headers(info, flags);
-	if (err)
-		goto free_copy;
-
-	/* Check module struct version now, before we try to use module. */
-	if (!check_modstruct_version(info, info->mod)) {
-		err = -ENOEXEC;
-		goto free_copy;
-	}
-
-	/* Figure out module layout, and allocate all the memory. */
-	mod = layout_and_allocate(info, flags);
-	if (IS_ERR(mod)) {
-		err = PTR_ERR(mod);
-		goto free_copy;
-	}
-
-	audit_log_kern_module(mod->name);
-
-	/* Reserve our place in the list. */
-	err = add_unformed_module(mod);
-	if (err)
-		goto free_module;
-
-#ifdef CONFIG_MODULE_SIG
-	mod->sig_ok = info->sig_ok;
-	if (!mod->sig_ok) {
-		pr_notice_once("%s: module verification failed: signature "
-			       "and/or required key missing - tainting "
-			       "kernel\n", mod->name);
-		add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
-	}
-#endif
-
-	/* To avoid stressing percpu allocator, do this once we're unique. */
-	err = percpu_modalloc(mod, info);
-	if (err)
-		goto unlink_mod;
-
-	/* Now module is in final location, initialize linked lists, etc. */
-	err = module_unload_init(mod);
-	if (err)
-		goto unlink_mod;
-
-	init_param_lock(mod);
-
-	/*
-	 * Now we've got everything in the final locations, we can
-	 * find optional sections.
-	 */
-	err = find_module_sections(mod, info);
-	if (err)
-		goto free_unload;
-
-	err = check_module_license_and_versions(mod);
-	if (err)
-		goto free_unload;
-
-	/* Set up MODINFO_ATTR fields */
-	setup_modinfo(mod, info);
-
-	/* Fix up syms, so that st_value is a pointer to location. */
-	err = simplify_symbols(mod, info);
-	if (err < 0)
-		goto free_modinfo;
-
-	err = apply_relocations(mod, info);
-	if (err < 0)
-		goto free_modinfo;
-
-	err = post_relocation(mod, info);
-	if (err < 0)
-		goto free_modinfo;
-
-	flush_module_icache(mod);
-
-	/* Setup CFI for the module. */
-	cfi_init(mod);
-
-	/* Now copy in args */
-	mod->args = strndup_user(uargs, ~0UL >> 1);
-	if (IS_ERR(mod->args)) {
-		err = PTR_ERR(mod->args);
-		goto free_arch_cleanup;
-	}
-
-	init_build_id(mod, info);
-	dynamic_debug_setup(mod, info->debug, info->num_debug);
-
-	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
-	ftrace_module_init(mod);
-
-	/* Finally it's fully formed, ready to start executing. */
-	err = complete_formation(mod, info);
-	if (err)
-		goto ddebug_cleanup;
-
-	err = prepare_coming_module(mod);
-	if (err)
-		goto bug_cleanup;
-
-	/* Module is ready to execute: parsing args may do that. */
-	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-				  -32768, 32767, mod,
-				  unknown_module_param_cb);
-	if (IS_ERR(after_dashes)) {
-		err = PTR_ERR(after_dashes);
-		goto coming_cleanup;
-	} else if (after_dashes) {
-		pr_warn("%s: parameters '%s' after `--' ignored\n",
-		       mod->name, after_dashes);
-	}
-
-	/* Link in to sysfs. */
-	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
-	if (err < 0)
-		goto coming_cleanup;
-
-	if (is_livepatch_module(mod)) {
-		err = copy_module_elf(mod, info);
-		if (err < 0)
-			goto sysfs_cleanup;
-	}
-
-	/* Get rid of temporary copy. */
-	free_copy(info, flags);
-
-	/* Done! */
-	trace_module_load(mod);
-
-	return do_init_module(mod);
-
- sysfs_cleanup:
-	mod_sysfs_teardown(mod);
- coming_cleanup:
-	mod->state = MODULE_STATE_GOING;
-	destroy_params(mod->kp, mod->num_kp);
-	blocking_notifier_call_chain(&module_notify_list,
-				     MODULE_STATE_GOING, mod);
-	klp_module_going(mod);
- bug_cleanup:
-	mod->state = MODULE_STATE_GOING;
-	/* module_bug_cleanup needs module_mutex protection */
-	mutex_lock(&module_mutex);
-	module_bug_cleanup(mod);
-	mutex_unlock(&module_mutex);
-
- ddebug_cleanup:
-	ftrace_release_mod(mod);
-	dynamic_debug_remove(mod, info->debug);
-	synchronize_rcu();
-	kfree(mod->args);
- free_arch_cleanup:
-	cfi_cleanup(mod);
-	module_arch_cleanup(mod);
- free_modinfo:
-	free_modinfo(mod);
- free_unload:
-	module_unload_free(mod);
- unlink_mod:
-	mutex_lock(&module_mutex);
-	/* Unlink carefully: kallsyms could be walking list. */
-	list_del_rcu(&mod->list);
-	mod_tree_remove(mod);
-	wake_up_all(&module_wq);
-	/* Wait for RCU-sched synchronizing before releasing mod->list. */
-	synchronize_rcu();
-	mutex_unlock(&module_mutex);
- free_module:
-	/* Free lock-classes; relies on the preceding sync_rcu() */
-	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
-
-	module_deallocate(mod, info);
- free_copy:
-	free_copy(info, flags);
-	return err;
-}
-
-SYSCALL_DEFINE3(init_module, void __user *, umod,
-		unsigned long, len, const char __user *, uargs)
-{
-	int err;
-	struct load_info info = { };
-
-	err = may_init_module();
-	if (err)
-		return err;
-
-	pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
-	       umod, len, uargs);
-
-	err = copy_module_from_user(umod, len, &info);
-	if (err)
-		return err;
-
-	return load_module(&info, uargs, 0);
-}
-
-SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
-{
-	struct load_info info = { };
-	void *buf = NULL;
-	int len;
-	int err;
-
-	err = may_init_module();
-	if (err)
-		return err;
-
-	pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
-
-	if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
-		      |MODULE_INIT_IGNORE_VERMAGIC
-		      |MODULE_INIT_COMPRESSED_FILE))
-		return -EINVAL;
-
-	len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
-				       READING_MODULE);
-	if (len < 0)
-		return len;
-
-	if (flags & MODULE_INIT_COMPRESSED_FILE) {
-		err = module_decompress(&info, buf, len);
-		vfree(buf); /* compressed data is no longer needed */
-		if (err)
-			return err;
-	} else {
-		info.hdr = buf;
-		info.len = len;
-	}
-
-	return load_module(&info, uargs, flags);
-}
-
-static inline int within(unsigned long addr, void *start, unsigned long size)
-{
-	return ((void *)addr >= start && (void *)addr < start + size);
-}
-
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
-	if (str[0] == '.' && str[1] == 'L')
-		return true;
-	return str[0] == '$' && strchr("axtd", str[1])
-	       && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
-{
-	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
-}
-
-/*
- * Given a module and address, find the corresponding symbol and return its name
- * while providing its size and offset if needed.
- */
-static const char *find_kallsyms_symbol(struct module *mod,
-					unsigned long addr,
-					unsigned long *size,
-					unsigned long *offset)
-{
-	unsigned int i, best = 0;
-	unsigned long nextval, bestval;
-	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
-	/* At worse, next value is at end of module */
-	if (within_module_init(addr, mod))
-		nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
-	else
-		nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
-
-	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
-
-	/*
-	 * Scan for closest preceding symbol, and next symbol. (ELF
-	 * starts real symbols at 1).
-	 */
-	for (i = 1; i < kallsyms->num_symtab; i++) {
-		const Elf_Sym *sym = &kallsyms->symtab[i];
-		unsigned long thisval = kallsyms_symbol_value(sym);
-
-		if (sym->st_shndx == SHN_UNDEF)
-			continue;
-
-		/*
-		 * We ignore unnamed symbols: they're uninformative
-		 * and inserted at a whim.
-		 */
-		if (*kallsyms_symbol_name(kallsyms, i) == '\0'
-		    || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
-			continue;
-
-		if (thisval <= addr && thisval > bestval) {
-			best = i;
-			bestval = thisval;
-		}
-		if (thisval > addr && thisval < nextval)
-			nextval = thisval;
-	}
-
-	if (!best)
-		return NULL;
-
-	if (size)
-		*size = nextval - bestval;
-	if (offset)
-		*offset = addr - bestval;
-
-	return kallsyms_symbol_name(kallsyms, best);
-}
-
-void * __weak dereference_module_function_descriptor(struct module *mod,
-						     void *ptr)
-{
-	return ptr;
-}
-
-/*
- * For kallsyms to ask for address resolution.  NULL means not found.  Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption.
- */
-const char *module_address_lookup(unsigned long addr,
-			    unsigned long *size,
-			    unsigned long *offset,
-			    char **modname,
-			    const unsigned char **modbuildid,
-			    char *namebuf)
-{
-	const char *ret = NULL;
-	struct module *mod;
-
-	preempt_disable();
-	mod = __module_address(addr);
-	if (mod) {
-		if (modname)
-			*modname = mod->name;
-		if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-			*modbuildid = mod->build_id;
-#else
-			*modbuildid = NULL;
-#endif
-		}
-
-		ret = find_kallsyms_symbol(mod, addr, size, offset);
-	}
-	/* Make a copy in here where it's safe */
-	if (ret) {
-		strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
-		ret = namebuf;
-	}
-	preempt_enable();
-
-	return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
-	struct module *mod;
-
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod)) {
-			const char *sym;
-
-			sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
-			if (!sym)
-				goto out;
-
-			strlcpy(symname, sym, KSYM_NAME_LEN);
-			preempt_enable();
-			return 0;
-		}
-	}
-out:
-	preempt_enable();
-	return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
-			unsigned long *offset, char *modname, char *name)
-{
-	struct module *mod;
-
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod)) {
-			const char *sym;
-
-			sym = find_kallsyms_symbol(mod, addr, size, offset);
-			if (!sym)
-				goto out;
-			if (modname)
-				strlcpy(modname, mod->name, MODULE_NAME_LEN);
-			if (name)
-				strlcpy(name, sym, KSYM_NAME_LEN);
-			preempt_enable();
-			return 0;
-		}
-	}
-out:
-	preempt_enable();
-	return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
-			char *name, char *module_name, int *exported)
-{
-	struct module *mod;
-
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		struct mod_kallsyms *kallsyms;
-
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		kallsyms = rcu_dereference_sched(mod->kallsyms);
-		if (symnum < kallsyms->num_symtab) {
-			const Elf_Sym *sym = &kallsyms->symtab[symnum];
-
-			*value = kallsyms_symbol_value(sym);
-			*type = kallsyms->typetab[symnum];
-			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
-			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
-			*exported = is_exported(name, *value, mod);
-			preempt_enable();
-			return 0;
-		}
-		symnum -= kallsyms->num_symtab;
-	}
-	preempt_enable();
-	return -ERANGE;
-}
-
-/* Given a module and name of symbol, find and return the symbol's value */
-static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
-{
-	unsigned int i;
-	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
-	for (i = 0; i < kallsyms->num_symtab; i++) {
-		const Elf_Sym *sym = &kallsyms->symtab[i];
-
-		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
-		    sym->st_shndx != SHN_UNDEF)
-			return kallsyms_symbol_value(sym);
-	}
-	return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
-	struct module *mod;
-	char *colon;
-	unsigned long ret = 0;
-
-	/* Don't lock: we're in enough trouble already. */
-	preempt_disable();
-	if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
-		if ((mod = find_module_all(name, colon - name, false)) != NULL)
-			ret = find_kallsyms_symbol_value(mod, colon+1);
-	} else {
-		list_for_each_entry_rcu(mod, &modules, list) {
-			if (mod->state == MODULE_STATE_UNFORMED)
-				continue;
-			if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
-				break;
-		}
-	}
-	preempt_enable();
-	return ret;
-}
-
-#ifdef CONFIG_LIVEPATCH
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-					     struct module *, unsigned long),
-				   void *data)
-{
-	struct module *mod;
-	unsigned int i;
-	int ret = 0;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(mod, &modules, list) {
-		/* We hold module_mutex: no need for rcu_dereference_sched */
-		struct mod_kallsyms *kallsyms = mod->kallsyms;
-
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		for (i = 0; i < kallsyms->num_symtab; i++) {
-			const Elf_Sym *sym = &kallsyms->symtab[i];
-
-			if (sym->st_shndx == SHN_UNDEF)
-				continue;
-
-			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
-				 mod, kallsyms_symbol_value(sym));
-			if (ret != 0)
-				goto out;
-
-			cond_resched();
-		}
-	}
-out:
-	mutex_unlock(&module_mutex);
-	return ret;
-}
-#endif /* CONFIG_LIVEPATCH */
-#endif /* CONFIG_KALLSYMS */
-
-static void cfi_init(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
-	initcall_t *init;
-	exitcall_t *exit;
-
-	rcu_read_lock_sched();
-	mod->cfi_check = (cfi_check_fn)
-		find_kallsyms_symbol_value(mod, "__cfi_check");
-	init = (initcall_t *)
-		find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
-	exit = (exitcall_t *)
-		find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
-	rcu_read_unlock_sched();
-
-	/* Fix init/exit functions to point to the CFI jump table */
-	if (init)
-		mod->init = *init;
-#ifdef CONFIG_MODULE_UNLOAD
-	if (exit)
-		mod->exit = *exit;
-#endif
-
-	cfi_module_add(mod, module_addr_min);
-#endif
-}
-
-static void cfi_cleanup(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
-	cfi_module_remove(mod, module_addr_min);
-#endif
-}
-
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
-/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
-static char *module_flags(struct module *mod, char *buf)
-{
-	int bx = 0;
-
-	BUG_ON(mod->state == MODULE_STATE_UNFORMED);
-	if (mod->taints ||
-	    mod->state == MODULE_STATE_GOING ||
-	    mod->state == MODULE_STATE_COMING) {
-		buf[bx++] = '(';
-		bx += module_flags_taint(mod, buf + bx);
-		/* Show a - for module-is-being-unloaded */
-		if (mod->state == MODULE_STATE_GOING)
-			buf[bx++] = '-';
-		/* Show a + for module-is-being-loaded */
-		if (mod->state == MODULE_STATE_COMING)
-			buf[bx++] = '+';
-		buf[bx++] = ')';
-	}
-	buf[bx] = '\0';
-
-	return buf;
-}
-
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&module_mutex);
-	return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
-	mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
-	struct module *mod = list_entry(p, struct module, list);
-	char buf[MODULE_FLAGS_BUF_SIZE];
-	void *value;
-
-	/* We always ignore unformed modules. */
-	if (mod->state == MODULE_STATE_UNFORMED)
-		return 0;
-
-	seq_printf(m, "%s %u",
-		   mod->name, mod->init_layout.size + mod->core_layout.size);
-	print_unload_info(m, mod);
-
-	/* Informative for users. */
-	seq_printf(m, " %s",
-		   mod->state == MODULE_STATE_GOING ? "Unloading" :
-		   mod->state == MODULE_STATE_COMING ? "Loading" :
-		   "Live");
-	/* Used by oprofile and other similar tools. */
-	value = m->private ? NULL : mod->core_layout.base;
-	seq_printf(m, " 0x%px", value);
-
-	/* Taints info */
-	if (mod->taints)
-		seq_printf(m, " %s", module_flags(mod, buf));
-
-	seq_puts(m, "\n");
-	return 0;
-}
-
-/*
- * Format: modulename size refcount deps address
- *
- * Where refcount is a number or -, and deps is a comma-separated list
- * of depends or -.
- */
-static const struct seq_operations modules_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= m_show
-};
-
-/*
- * This also sets the "private" pointer to non-NULL if the
- * kernel pointers should be hidden (so you can just test
- * "m->private" to see if you should keep the values private).
- *
- * We use the same logic as for /proc/kallsyms.
- */
-static int modules_open(struct inode *inode, struct file *file)
-{
-	int err = seq_open(file, &modules_op);
-
-	if (!err) {
-		struct seq_file *m = file->private_data;
-		m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
-	}
-
-	return err;
-}
-
-static const struct proc_ops modules_proc_ops = {
-	.proc_flags	= PROC_ENTRY_PERMANENT,
-	.proc_open	= modules_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
-	proc_create("modules", 0, NULL, &modules_proc_ops);
-	return 0;
-}
-module_init(proc_modules_init);
-#endif
-
-/* Given an address, look for it in the module exception tables. */
-const struct exception_table_entry *search_module_extables(unsigned long addr)
-{
-	const struct exception_table_entry *e = NULL;
-	struct module *mod;
-
-	preempt_disable();
-	mod = __module_address(addr);
-	if (!mod)
-		goto out;
-
-	if (!mod->num_exentries)
-		goto out;
-
-	e = search_extable(mod->extable,
-			   mod->num_exentries,
-			   addr);
-out:
-	preempt_enable();
-
-	/*
-	 * Now, if we found one, we are running inside it now, hence
-	 * we cannot unload the module, hence no refcnt needed.
-	 */
-	return e;
-}
-
-/**
- * is_module_address() - is this address inside a module?
- * @addr: the address to check.
- *
- * See is_module_text_address() if you simply want to see if the address
- * is code (not data).
- */
-bool is_module_address(unsigned long addr)
-{
-	bool ret;
-
-	preempt_disable();
-	ret = __module_address(addr) != NULL;
-	preempt_enable();
-
-	return ret;
-}
-
-/**
- * __module_address() - get the module which contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_address(unsigned long addr)
-{
-	struct module *mod;
-
-	if (addr < module_addr_min || addr > module_addr_max)
-		return NULL;
-
-	module_assert_mutex_or_preempt();
-
-	mod = mod_find(addr);
-	if (mod) {
-		BUG_ON(!within_module(addr, mod));
-		if (mod->state == MODULE_STATE_UNFORMED)
-			mod = NULL;
-	}
-	return mod;
-}
-
-/**
- * is_module_text_address() - is this address inside module code?
- * @addr: the address to check.
- *
- * See is_module_address() if you simply want to see if the address is
- * anywhere in a module.  See kernel_text_address() for testing if an
- * address corresponds to kernel or module code.
- */
-bool is_module_text_address(unsigned long addr)
-{
-	bool ret;
-
-	preempt_disable();
-	ret = __module_text_address(addr) != NULL;
-	preempt_enable();
-
-	return ret;
-}
-
-/**
- * __module_text_address() - get the module whose code contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_text_address(unsigned long addr)
-{
-	struct module *mod = __module_address(addr);
-	if (mod) {
-		/* Make sure it's within the text section. */
-		if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
-		    && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
-			mod = NULL;
-	}
-	return mod;
-}
-
-/* Don't grab lock, we're oopsing. */
-void print_modules(void)
-{
-	struct module *mod;
-	char buf[MODULE_FLAGS_BUF_SIZE];
-
-	printk(KERN_DEFAULT "Modules linked in:");
-	/* Most callers should already have preempt disabled, but make sure */
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		pr_cont(" %s%s", mod->name, module_flags(mod, buf));
-	}
-	preempt_enable();
-	if (last_unloaded_module[0])
-		pr_cont(" [last unloaded: %s]", last_unloaded_module);
-	pr_cont("\n");
-}
-
-#ifdef CONFIG_MODVERSIONS
-/*
- * Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module.
- */
-void module_layout(struct module *mod,
-		   struct modversion_info *ver,
-		   struct kernel_param *kp,
-		   struct kernel_symbol *ks,
-		   struct tracepoint * const *tp)
-{
-}
-EXPORT_SYMBOL(module_layout);
-#endif
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
new file mode 100644
index 000000000000..cdd5c61b8c7f
--- /dev/null
+++ b/kernel/module/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for linux kernel module support
+#
+
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+
+obj-y += main.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
+obj-$(CONFIG_MODULE_SIG) += signing.o
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
new file mode 100644
index 000000000000..d14d6443225a
--- /dev/null
+++ b/kernel/module/decompress.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+
+static int module_extend_max_pages(struct load_info *info, unsigned int extent)
+{
+	struct page **new_pages;
+
+	new_pages = kvmalloc_array(info->max_pages + extent,
+				   sizeof(info->pages), GFP_KERNEL);
+	if (!new_pages)
+		return -ENOMEM;
+
+	memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages));
+	kvfree(info->pages);
+	info->pages = new_pages;
+	info->max_pages += extent;
+
+	return 0;
+}
+
+static struct page *module_get_next_page(struct load_info *info)
+{
+	struct page *page;
+	int error;
+
+	if (info->max_pages == info->used_pages) {
+		error = module_extend_max_pages(info, info->used_pages);
+		if (error)
+			return ERR_PTR(error);
+	}
+
+	page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	info->pages[info->used_pages++] = page;
+	return page;
+}
+
+#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#include <linux/zlib.h>
+#define MODULE_COMPRESSION	gzip
+#define MODULE_DECOMPRESS_FN	module_gzip_decompress
+
+/*
+ * Calculate length of the header which consists of signature, header
+ * flags, time stamp and operating system ID (10 bytes total), plus
+ * an optional filename.
+ */
+static size_t module_gzip_header_len(const u8 *buf, size_t size)
+{
+	const u8 signature[] = { 0x1f, 0x8b, 0x08 };
+	size_t len = 10;
+
+	if (size < len || memcmp(buf, signature, sizeof(signature)))
+		return 0;
+
+	if (buf[3] & 0x08) {
+		do {
+			/*
+			 * If we can't find the end of the file name we must
+			 * be dealing with a corrupted file.
+			 */
+			if (len == size)
+				return 0;
+		} while (buf[len++] != '\0');
+	}
+
+	return len;
+}
+
+static ssize_t module_gzip_decompress(struct load_info *info,
+				      const void *buf, size_t size)
+{
+	struct z_stream_s s = { 0 };
+	size_t new_size = 0;
+	size_t gzip_hdr_len;
+	ssize_t retval;
+	int rc;
+
+	gzip_hdr_len = module_gzip_header_len(buf, size);
+	if (!gzip_hdr_len) {
+		pr_err("not a gzip compressed module\n");
+		return -EINVAL;
+	}
+
+	s.next_in = buf + gzip_hdr_len;
+	s.avail_in = size - gzip_hdr_len;
+
+	s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+	if (!s.workspace)
+		return -ENOMEM;
+
+	rc = zlib_inflateInit2(&s, -MAX_WBITS);
+	if (rc != Z_OK) {
+		pr_err("failed to initialize decompressor: %d\n", rc);
+		retval = -EINVAL;
+		goto out;
+	}
+
+	do {
+		struct page *page = module_get_next_page(info);
+		if (!page) {
+			retval = -ENOMEM;
+			goto out_inflate_end;
+		}
+
+		s.next_out = kmap(page);
+		s.avail_out = PAGE_SIZE;
+		rc = zlib_inflate(&s, 0);
+		kunmap(page);
+
+		new_size += PAGE_SIZE - s.avail_out;
+	} while (rc == Z_OK);
+
+	if (rc != Z_STREAM_END) {
+		pr_err("decompression failed with status %d\n", rc);
+		retval = -EINVAL;
+		goto out_inflate_end;
+	}
+
+	retval = new_size;
+
+out_inflate_end:
+	zlib_inflateEnd(&s);
+out:
+	kfree(s.workspace);
+	return retval;
+}
+#elif CONFIG_MODULE_COMPRESS_XZ
+#include <linux/xz.h>
+#define MODULE_COMPRESSION	xz
+#define MODULE_DECOMPRESS_FN	module_xz_decompress
+
+static ssize_t module_xz_decompress(struct load_info *info,
+				    const void *buf, size_t size)
+{
+	static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 };
+	struct xz_dec *xz_dec;
+	struct xz_buf xz_buf;
+	enum xz_ret xz_ret;
+	size_t new_size = 0;
+	ssize_t retval;
+
+	if (size < sizeof(signature) ||
+	    memcmp(buf, signature, sizeof(signature))) {
+		pr_err("not an xz compressed module\n");
+		return -EINVAL;
+	}
+
+	xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1);
+	if (!xz_dec)
+		return -ENOMEM;
+
+	xz_buf.in_size = size;
+	xz_buf.in = buf;
+	xz_buf.in_pos = 0;
+
+	do {
+		struct page *page = module_get_next_page(info);
+		if (!page) {
+			retval = -ENOMEM;
+			goto out;
+		}
+
+		xz_buf.out = kmap(page);
+		xz_buf.out_pos = 0;
+		xz_buf.out_size = PAGE_SIZE;
+		xz_ret = xz_dec_run(xz_dec, &xz_buf);
+		kunmap(page);
+
+		new_size += xz_buf.out_pos;
+	} while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK);
+
+	if (xz_ret != XZ_STREAM_END) {
+		pr_err("decompression failed with status %d\n", xz_ret);
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = new_size;
+
+ out:
+	xz_dec_end(xz_dec);
+	return retval;
+}
+#else
+#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
+#endif
+
+int module_decompress(struct load_info *info, const void *buf, size_t size)
+{
+	unsigned int n_pages;
+	ssize_t data_size;
+	int error;
+
+	/*
+	 * Start with number of pages twice as big as needed for
+	 * compressed data.
+	 */
+	n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2;
+	error = module_extend_max_pages(info, n_pages);
+
+	data_size = MODULE_DECOMPRESS_FN(info, buf, size);
+	if (data_size < 0) {
+		error = data_size;
+		goto err;
+	}
+
+	info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL);
+	if (!info->hdr) {
+		error = -ENOMEM;
+		goto err;
+	}
+
+	info->len = data_size;
+	return 0;
+
+err:
+	module_decompress_cleanup(info);
+	return error;
+}
+
+void module_decompress_cleanup(struct load_info *info)
+{
+	int i;
+
+	if (info->hdr)
+		vunmap(info->hdr);
+
+	for (i = 0; i < info->used_pages; i++)
+		__free_page(info->pages[i]);
+
+	kvfree(info->pages);
+
+	info->pages = NULL;
+	info->max_pages = info->used_pages = 0;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t compression_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
+}
+static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
+
+static int __init module_decompress_sysfs_init(void)
+{
+	int error;
+
+	error = sysfs_create_file(&module_kset->kobj,
+				  &module_compression_attr.attr);
+	if (error)
+		pr_warn("Failed to create 'compression' attribute");
+
+	return 0;
+}
+late_initcall(module_decompress_sysfs_init);
+#endif
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
new file mode 100644
index 000000000000..8c381c99062f
--- /dev/null
+++ b/kernel/module/internal.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/elf.h>
+#include <asm/module.h>
+
+struct load_info {
+	const char *name;
+	/* pointer to module in temporary copy, freed at end of load_module() */
+	struct module *mod;
+	Elf_Ehdr *hdr;
+	unsigned long len;
+	Elf_Shdr *sechdrs;
+	char *secstrings, *strtab;
+	unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
+	struct _ddebug *debug;
+	unsigned int num_debug;
+	bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+	unsigned long mod_kallsyms_init_off;
+#endif
+#ifdef CONFIG_MODULE_DECOMPRESS
+	struct page **pages;
+	unsigned int max_pages;
+	unsigned int used_pages;
+#endif
+	struct {
+		unsigned int sym, str, mod, vers, info, pcpu;
+	} index;
+};
+
+extern int mod_verify_sig(const void *mod, struct load_info *info);
+
+#ifdef CONFIG_MODULE_DECOMPRESS
+int module_decompress(struct load_info *info, const void *buf, size_t size);
+void module_decompress_cleanup(struct load_info *info);
+#else
+static inline int module_decompress(struct load_info *info,
+				    const void *buf, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+static inline void module_decompress_cleanup(struct load_info *info)
+{
+}
+#endif
diff --git a/kernel/module/main.c b/kernel/module/main.c
new file mode 100644
index 000000000000..1a17f02f69a0
--- /dev/null
+++ b/kernel/module/main.c
@@ -0,0 +1,4810 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ */
+
+#define INCLUDE_VERMAGIC
+
+#include <linux/export.h>
+#include <linux/extable.h>
+#include <linux/moduleloader.h>
+#include <linux/module_signature.h>
+#include <linux/trace_events.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <linux/set_memory.h>
+#include <asm/mmu_context.h>
+#include <linux/license.h>
+#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+#include <linux/livepatch.h>
+#include <linux/async.h>
+#include <linux/percpu.h>
+#include <linux/kmemleak.h>
+#include <linux/jump_label.h>
+#include <linux/pfn.h>
+#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
+#include <linux/audit.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
+ */
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+# define debug_align(X) (X)
+#endif
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
+
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable with preempt_disable),
+ * 2) module_use links,
+ * 3) module_addr_min/module_addr_max.
+ * (delete and add uses RCU list operations).
+ */
+static DEFINE_MUTEX(module_mutex);
+static LIST_HEAD(modules);
+
+/* Work queue for freeing init sections in success case */
+static void do_free_init(struct work_struct *w);
+static DECLARE_WORK(init_free_wq, do_free_init);
+static LLIST_HEAD(init_free_list);
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+
+	return (unsigned long)layout->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+
+	return (unsigned long)layout->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+	return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long start, end;
+
+	start = __mod_tree_val(n);
+	if (val < start)
+		return -1;
+
+	end = start + __mod_tree_size(n);
+	if (val >= end)
+		return 1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+	.less = mod_tree_less,
+	.comp = mod_tree_comp,
+};
+
+static struct mod_tree_root {
+	struct latch_tree_root root;
+	unsigned long addr_min;
+	unsigned long addr_max;
+} mod_tree __cacheline_aligned = {
+	.addr_min = -1UL,
+};
+
+#define module_addr_min mod_tree.addr_min
+#define module_addr_max mod_tree.addr_max
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node)
+{
+	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node)
+{
+	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+static void mod_tree_insert(struct module *mod)
+{
+	mod->core_layout.mtn.mod = mod;
+	mod->init_layout.mtn.mod = mod;
+
+	__mod_tree_insert(&mod->core_layout.mtn);
+	if (mod->init_layout.size)
+		__mod_tree_insert(&mod->init_layout.mtn);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	if (mod->init_layout.size)
+		__mod_tree_remove(&mod->init_layout.mtn);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	__mod_tree_remove(&mod->core_layout.mtn);
+	mod_tree_remove_init(mod);
+}
+
+static struct module *mod_find(unsigned long addr)
+{
+	struct latch_tree_node *ltn;
+
+	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+	if (!ltn)
+		return NULL;
+
+	return container_of(ltn, struct mod_tree_node, node)->mod;
+}
+
+#else /* MODULES_TREE_LOOKUP */
+
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
+static void mod_tree_insert(struct module *mod) { }
+static void mod_tree_remove_init(struct module *mod) { }
+static void mod_tree_remove(struct module *mod) { }
+
+static struct module *mod_find(unsigned long addr)
+{
+	struct module *mod;
+
+	list_for_each_entry_rcu(mod, &modules, list,
+				lockdep_is_held(&module_mutex)) {
+		if (within_module(addr, mod))
+			return mod;
+	}
+
+	return NULL;
+}
+
+#endif /* MODULES_TREE_LOOKUP */
+
+/*
+ * Bounds of module text, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(void *base, unsigned int size)
+{
+	unsigned long min = (unsigned long)base;
+	unsigned long max = min + size;
+
+	if (min < module_addr_min)
+		module_addr_min = min;
+	if (max > module_addr_max)
+		module_addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+	if (mod->init_layout.size)
+		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
+}
+
+#ifdef CONFIG_KGDB_KDB
+struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+#endif /* CONFIG_KGDB_KDB */
+
+static void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+	if (unlikely(!debug_locks))
+		return;
+
+	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
+		!lockdep_is_held(&module_mutex));
+#endif
+}
+
+#ifdef CONFIG_MODULE_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+void set_module_sig_enforced(void)
+{
+	sig_enforce = true;
+}
+#else
+#define sig_enforce false
+#endif
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+	return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+core_param(nomodule, modules_disabled, bint, 0);
+
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+
+int register_module_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/*
+ * We require a truly strong try_module_get(): 0 means success.
+ * Otherwise an error is returned due to ongoing or failed
+ * initialization etc.
+ */
+static inline int strong_try_module_get(struct module *mod)
+{
+	BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
+	if (mod && mod->state == MODULE_STATE_COMING)
+		return -EBUSY;
+	if (try_module_get(mod))
+		return 0;
+	else
+		return -ENOENT;
+}
+
+static inline void add_taint_module(struct module *mod, unsigned flag,
+				    enum lockdep_ok lockdep_ok)
+{
+	add_taint(flag, lockdep_ok);
+	set_bit(flag, &mod->taints);
+}
+
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit.
+ */
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
+{
+	module_put(mod);
+	kthread_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(const struct load_info *info, const char *name)
+{
+	unsigned int i;
+
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *shdr = &info->sechdrs[i];
+		/* Alloc bit cleared means "ignore it." */
+		if ((shdr->sh_flags & SHF_ALLOC)
+		    && strcmp(info->secstrings + shdr->sh_name, name) == 0)
+			return i;
+	}
+	return 0;
+}
+
+/* Find a module section, or NULL. */
+static void *section_addr(const struct load_info *info, const char *name)
+{
+	/* Section 0 has sh_addr 0. */
+	return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
+}
+
+/* Find a module section, or NULL.  Fill in number of "objects" in section. */
+static void *section_objs(const struct load_info *info,
+			  const char *name,
+			  size_t object_size,
+			  unsigned int *num)
+{
+	unsigned int sec = find_sec(info, name);
+
+	/* Section 0 has sh_addr 0 and sh_size 0. */
+	*num = info->sechdrs[sec].sh_size / object_size;
+	return (void *)info->sechdrs[sec].sh_addr;
+}
+
+/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */
+static unsigned int find_any_sec(const struct load_info *info, const char *name)
+{
+	unsigned int i;
+
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *shdr = &info->sechdrs[i];
+		if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
+			return i;
+	}
+	return 0;
+}
+
+/*
+ * Find a module section, or NULL. Fill in number of "objects" in section.
+ * Ignores SHF_ALLOC flag.
+ */
+static __maybe_unused void *any_section_objs(const struct load_info *info,
+					     const char *name,
+					     size_t object_size,
+					     unsigned int *num)
+{
+	unsigned int sec = find_any_sec(info, name);
+
+	/* Section 0 has sh_addr 0 and sh_size 0. */
+	*num = info->sechdrs[sec].sh_size / object_size;
+	return (void *)info->sechdrs[sec].sh_addr;
+}
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+#endif
+
+struct symsearch {
+	const struct kernel_symbol *start, *stop;
+	const s32 *crcs;
+	enum mod_license {
+		NOT_GPL_ONLY,
+		GPL_ONLY,
+	} license;
+};
+
+struct find_symbol_arg {
+	/* Input */
+	const char *name;
+	bool gplok;
+	bool warn;
+
+	/* Output */
+	struct module *owner;
+	const s32 *crc;
+	const struct kernel_symbol *sym;
+	enum mod_license license;
+};
+
+static bool check_exported_symbol(const struct symsearch *syms,
+				  struct module *owner,
+				  unsigned int symnum, void *data)
+{
+	struct find_symbol_arg *fsa = data;
+
+	if (!fsa->gplok && syms->license == GPL_ONLY)
+		return false;
+	fsa->owner = owner;
+	fsa->crc = symversion(syms->crcs, symnum);
+	fsa->sym = &syms->start[symnum];
+	fsa->license = syms->license;
+	return true;
+}
+
+static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+	return sym->value;
+#endif
+}
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	return offset_to_ptr(&sym->name_offset);
+#else
+	return sym->name;
+#endif
+}
+
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	if (!sym->namespace_offset)
+		return NULL;
+	return offset_to_ptr(&sym->namespace_offset);
+#else
+	return sym->namespace;
+#endif
+}
+
+static int cmp_name(const void *name, const void *sym)
+{
+	return strcmp(name, kernel_symbol_name(sym));
+}
+
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+					    struct module *owner,
+					    void *data)
+{
+	struct find_symbol_arg *fsa = data;
+	struct kernel_symbol *sym;
+
+	sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+			sizeof(struct kernel_symbol), cmp_name);
+
+	if (sym != NULL && check_exported_symbol(syms, owner,
+						 sym - syms->start, data))
+		return true;
+
+	return false;
+}
+
+/*
+ * Find an exported symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it.  Needs preempt disabled or module_mutex.
+ */
+static bool find_symbol(struct find_symbol_arg *fsa)
+{
+	static const struct symsearch arr[] = {
+		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+		  NOT_GPL_ONLY },
+		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
+		  __start___kcrctab_gpl,
+		  GPL_ONLY },
+	};
+	struct module *mod;
+	unsigned int i;
+
+	module_assert_mutex_or_preempt();
+
+	for (i = 0; i < ARRAY_SIZE(arr); i++)
+		if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
+			return true;
+
+	list_for_each_entry_rcu(mod, &modules, list,
+				lockdep_is_held(&module_mutex)) {
+		struct symsearch arr[] = {
+			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
+			  NOT_GPL_ONLY },
+			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+			  mod->gpl_crcs,
+			  GPL_ONLY },
+		};
+
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+
+		for (i = 0; i < ARRAY_SIZE(arr); i++)
+			if (find_exported_symbol_in_section(&arr[i], mod, fsa))
+				return true;
+	}
+
+	pr_debug("Failed to find symbol %s\n", fsa->name);
+	return false;
+}
+
+/*
+ * Search for module by name: must hold module_mutex (or preempt disabled
+ * for read-only access).
+ */
+static struct module *find_module_all(const char *name, size_t len,
+				      bool even_unformed)
+{
+	struct module *mod;
+
+	module_assert_mutex_or_preempt();
+
+	list_for_each_entry_rcu(mod, &modules, list,
+				lockdep_is_held(&module_mutex)) {
+		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
+			return mod;
+	}
+	return NULL;
+}
+
+struct module *find_module(const char *name)
+{
+	return find_module_all(name, strlen(name), false);
+}
+
+#ifdef CONFIG_SMP
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+	return mod->percpu;
+}
+
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+	Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+	unsigned long align = pcpusec->sh_addralign;
+
+	if (!pcpusec->sh_size)
+		return 0;
+
+	if (align > PAGE_SIZE) {
+		pr_warn("%s: per-cpu alignment %li > %li\n",
+			mod->name, align, PAGE_SIZE);
+		align = PAGE_SIZE;
+	}
+
+	mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
+	if (!mod->percpu) {
+		pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+			mod->name, (unsigned long)pcpusec->sh_size);
+		return -ENOMEM;
+	}
+	mod->percpu_size = pcpusec->sh_size;
+	return 0;
+}
+
+static void percpu_modfree(struct module *mod)
+{
+	free_percpu(mod->percpu);
+}
+
+static unsigned int find_pcpusec(struct load_info *info)
+{
+	return find_sec(info, ".data..percpu");
+}
+
+static void percpu_modcopy(struct module *mod,
+			   const void *from, unsigned long size)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+	struct module *mod;
+	unsigned int cpu;
+
+	preempt_disable();
+
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		if (!mod->percpu_size)
+			continue;
+		for_each_possible_cpu(cpu) {
+			void *start = per_cpu_ptr(mod->percpu, cpu);
+			void *va = (void *)addr;
+
+			if (va >= start && va < start + mod->percpu_size) {
+				if (can_addr) {
+					*can_addr = (unsigned long) (va - start);
+					*can_addr += (unsigned long)
+						per_cpu_ptr(mod->percpu,
+							    get_boot_cpu_id());
+				}
+				preempt_enable();
+				return true;
+			}
+		}
+	}
+
+	preempt_enable();
+	return false;
+}
+
+/**
+ * is_module_percpu_address() - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * Return: %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+	return __is_module_percpu_address(addr, NULL);
+}
+
+#else /* ... !CONFIG_SMP */
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+	return NULL;
+}
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+	/* UP modules shouldn't have this section: ENOMEM isn't quite right */
+	if (info->sechdrs[info->index.pcpu].sh_size != 0)
+		return -ENOMEM;
+	return 0;
+}
+static inline void percpu_modfree(struct module *mod)
+{
+}
+static unsigned int find_pcpusec(struct load_info *info)
+{
+	return 0;
+}
+static inline void percpu_modcopy(struct module *mod,
+				  const void *from, unsigned long size)
+{
+	/* pcpusec should be 0, and size of that section should be 0. */
+	BUG_ON(size != 0);
+}
+bool is_module_percpu_address(unsigned long addr)
+{
+	return false;
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+	return false;
+}
+
+#endif /* CONFIG_SMP */
+
+#define MODINFO_ATTR(field)	\
+static void setup_modinfo_##field(struct module *mod, const char *s)  \
+{                                                                     \
+	mod->field = kstrdup(s, GFP_KERNEL);                          \
+}                                                                     \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
+			struct module_kobject *mk, char *buffer)      \
+{                                                                     \
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field);  \
+}                                                                     \
+static int modinfo_##field##_exists(struct module *mod)               \
+{                                                                     \
+	return mod->field != NULL;                                    \
+}                                                                     \
+static void free_modinfo_##field(struct module *mod)                  \
+{                                                                     \
+	kfree(mod->field);                                            \
+	mod->field = NULL;                                            \
+}                                                                     \
+static struct module_attribute modinfo_##field = {                    \
+	.attr = { .name = __stringify(field), .mode = 0444 },         \
+	.show = show_modinfo_##field,                                 \
+	.setup = setup_modinfo_##field,                               \
+	.test = modinfo_##field##_exists,                             \
+	.free = free_modinfo_##field,                                 \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static char last_unloaded_module[MODULE_NAME_LEN+1];
+
+#ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE	1
+
+/* Init the unload section of the module. */
+static int module_unload_init(struct module *mod)
+{
+	/*
+	 * Initialize reference counter to MODULE_REF_BASE.
+	 * refcnt == 0 means module is going.
+	 */
+	atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
+	INIT_LIST_HEAD(&mod->source_list);
+	INIT_LIST_HEAD(&mod->target_list);
+
+	/* Hold reference count during initialization. */
+	atomic_inc(&mod->refcnt);
+
+	return 0;
+}
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+	struct module_use *use;
+
+	list_for_each_entry(use, &b->source_list, source_list) {
+		if (use->source == a) {
+			pr_debug("%s uses %s!\n", a->name, b->name);
+			return 1;
+		}
+	}
+	pr_debug("%s does not use %s!\n", a->name, b->name);
+	return 0;
+}
+
+/*
+ * Module a uses b
+ *  - we add 'a' as a "source", 'b' as a "target" of module use
+ *  - the module_use is added to the list of 'b' sources (so
+ *    'b' can walk the list to see who sourced them), and of 'a'
+ *    targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
+{
+	struct module_use *use;
+
+	pr_debug("Allocating new usage for %s.\n", a->name);
+	use = kmalloc(sizeof(*use), GFP_ATOMIC);
+	if (!use)
+		return -ENOMEM;
+
+	use->source = a;
+	use->target = b;
+	list_add(&use->source_list, &b->source_list);
+	list_add(&use->target_list, &a->target_list);
+	return 0;
+}
+
+/* Module a uses b: caller needs module_mutex() */
+static int ref_module(struct module *a, struct module *b)
+{
+	int err;
+
+	if (b == NULL || already_uses(a, b))
+		return 0;
+
+	/* If module isn't available, we fail. */
+	err = strong_try_module_get(b);
+	if (err)
+		return err;
+
+	err = add_module_usage(a, b);
+	if (err) {
+		module_put(b);
+		return err;
+	}
+	return 0;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+	struct module_use *use, *tmp;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+		struct module *i = use->target;
+		pr_debug("%s unusing %s\n", mod->name, i->name);
+		module_put(i);
+		list_del(&use->source_list);
+		list_del(&use->target_list);
+		kfree(use);
+	}
+	mutex_unlock(&module_mutex);
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force_unload(unsigned int flags)
+{
+	int ret = (flags & O_TRUNC);
+	if (ret)
+		add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
+	return ret;
+}
+#else
+static inline int try_force_unload(unsigned int flags)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
+{
+	int ret;
+
+	/* Try to decrement refcnt which we set at loading */
+	ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+	BUG_ON(ret < 0);
+	if (ret)
+		/* Someone can put this right now, recover with checking */
+		ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+	return ret;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+	/* If it's not unused, quit unless we're forcing. */
+	if (try_release_module_ref(mod) != 0) {
+		*forced = try_force_unload(flags);
+		if (!(*forced))
+			return -EWOULDBLOCK;
+	}
+
+	/* Mark it as dying. */
+	mod->state = MODULE_STATE_GOING;
+
+	return 0;
+}
+
+/**
+ * module_refcount() - return the refcount or -1 if unloading
+ * @mod:	the module we're checking
+ *
+ * Return:
+ *	-1 if the module is in the process of unloading
+ *	otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
+{
+	return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+		unsigned int, flags)
+{
+	struct module *mod;
+	char name[MODULE_NAME_LEN];
+	int ret, forced = 0;
+
+	if (!capable(CAP_SYS_MODULE) || modules_disabled)
+		return -EPERM;
+
+	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+		return -EFAULT;
+	name[MODULE_NAME_LEN-1] = '\0';
+
+	audit_log_kern_module(name);
+
+	if (mutex_lock_interruptible(&module_mutex) != 0)
+		return -EINTR;
+
+	mod = find_module(name);
+	if (!mod) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (!list_empty(&mod->source_list)) {
+		/* Other modules depend on us: get rid of them first. */
+		ret = -EWOULDBLOCK;
+		goto out;
+	}
+
+	/* Doing init or already dying? */
+	if (mod->state != MODULE_STATE_LIVE) {
+		/* FIXME: if (force), slam module count damn the torpedoes */
+		pr_debug("%s already dying\n", mod->name);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/* If it has an init func, it must have an exit func to unload */
+	if (mod->init && !mod->exit) {
+		forced = try_force_unload(flags);
+		if (!forced) {
+			/* This module can't be removed */
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	ret = try_stop_module(mod, flags, &forced);
+	if (ret != 0)
+		goto out;
+
+	mutex_unlock(&module_mutex);
+	/* Final destruction now no one is using it. */
+	if (mod->exit != NULL)
+		mod->exit();
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_GOING, mod);
+	klp_module_going(mod);
+	ftrace_release_mod(mod);
+
+	async_synchronize_full();
+
+	/* Store the name of the last unloaded module for diagnostic purposes */
+	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
+
+	free_module(mod);
+	/* someone could wait for the module in add_unformed_module() */
+	wake_up_all(&module_wq);
+	return 0;
+out:
+	mutex_unlock(&module_mutex);
+	return ret;
+}
+
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+	struct module_use *use;
+	int printed_something = 0;
+
+	seq_printf(m, " %i ", module_refcount(mod));
+
+	/*
+	 * Always include a trailing , so userspace can differentiate
+	 * between this and the old multi-field proc format.
+	 */
+	list_for_each_entry(use, &mod->source_list, source_list) {
+		printed_something = 1;
+		seq_printf(m, "%s,", use->source->name);
+	}
+
+	if (mod->init != NULL && mod->exit == NULL) {
+		printed_something = 1;
+		seq_puts(m, "[permanent],");
+	}
+
+	if (!printed_something)
+		seq_puts(m, "-");
+}
+
+void __symbol_put(const char *symbol)
+{
+	struct find_symbol_arg fsa = {
+		.name	= symbol,
+		.gplok	= true,
+	};
+
+	preempt_disable();
+	BUG_ON(!find_symbol(&fsa));
+	module_put(fsa.owner);
+	preempt_enable();
+}
+EXPORT_SYMBOL(__symbol_put);
+
+/* Note this assumes addr is a function, which it currently always is. */
+void symbol_put_addr(void *addr)
+{
+	struct module *modaddr;
+	unsigned long a = (unsigned long)dereference_function_descriptor(addr);
+
+	if (core_kernel_text(a))
+		return;
+
+	/*
+	 * Even though we hold a reference on the module; we still need to
+	 * disable preemption in order to safely traverse the data structure.
+	 */
+	preempt_disable();
+	modaddr = __module_text_address(a);
+	BUG_ON(!modaddr);
+	module_put(modaddr);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(struct module_attribute *mattr,
+			   struct module_kobject *mk, char *buffer)
+{
+	return sprintf(buffer, "%i\n", module_refcount(mk->mod));
+}
+
+static struct module_attribute modinfo_refcnt =
+	__ATTR(refcnt, 0444, show_refcnt, NULL);
+
+void __module_get(struct module *module)
+{
+	if (module) {
+		preempt_disable();
+		atomic_inc(&module->refcnt);
+		trace_module_get(module, _RET_IP_);
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+	bool ret = true;
+
+	if (module) {
+		preempt_disable();
+		/* Note: here, we can fail to get a reference */
+		if (likely(module_is_live(module) &&
+			   atomic_inc_not_zero(&module->refcnt) != 0))
+			trace_module_get(module, _RET_IP_);
+		else
+			ret = false;
+
+		preempt_enable();
+	}
+	return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
+void module_put(struct module *module)
+{
+	int ret;
+
+	if (module) {
+		preempt_disable();
+		ret = atomic_dec_if_positive(&module->refcnt);
+		WARN_ON(ret < 0);	/* Failed to put refcount */
+		trace_module_put(module, _RET_IP_);
+		preempt_enable();
+	}
+}
+EXPORT_SYMBOL(module_put);
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+	/* We don't know the usage count, or what modules are using. */
+	seq_puts(m, " - -");
+}
+
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static int ref_module(struct module *a, struct module *b)
+{
+	return strong_try_module_get(b);
+}
+
+static inline int module_unload_init(struct module *mod)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+static size_t module_flags_taint(struct module *mod, char *buf)
+{
+	size_t l = 0;
+	int i;
+
+	for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+		if (taint_flags[i].module && test_bit(i, &mod->taints))
+			buf[l++] = taint_flags[i].c_true;
+	}
+
+	return l;
+}
+
+static ssize_t show_initstate(struct module_attribute *mattr,
+			      struct module_kobject *mk, char *buffer)
+{
+	const char *state = "unknown";
+
+	switch (mk->mod->state) {
+	case MODULE_STATE_LIVE:
+		state = "live";
+		break;
+	case MODULE_STATE_COMING:
+		state = "coming";
+		break;
+	case MODULE_STATE_GOING:
+		state = "going";
+		break;
+	default:
+		BUG();
+	}
+	return sprintf(buffer, "%s\n", state);
+}
+
+static struct module_attribute modinfo_initstate =
+	__ATTR(initstate, 0444, show_initstate, NULL);
+
+static ssize_t store_uevent(struct module_attribute *mattr,
+			    struct module_kobject *mk,
+			    const char *buffer, size_t count)
+{
+	int rc;
+
+	rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+	return rc ? rc : count;
+}
+
+struct module_attribute module_uevent =
+	__ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(struct module_attribute *mattr,
+			     struct module_kobject *mk, char *buffer)
+{
+	return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
+}
+
+static struct module_attribute modinfo_coresize =
+	__ATTR(coresize, 0444, show_coresize, NULL);
+
+static ssize_t show_initsize(struct module_attribute *mattr,
+			     struct module_kobject *mk, char *buffer)
+{
+	return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
+}
+
+static struct module_attribute modinfo_initsize =
+	__ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(struct module_attribute *mattr,
+			  struct module_kobject *mk, char *buffer)
+{
+	size_t l;
+
+	l = module_flags_taint(mk->mod, buffer);
+	buffer[l++] = '\n';
+	return l;
+}
+
+static struct module_attribute modinfo_taint =
+	__ATTR(taint, 0444, show_taint, NULL);
+
+static struct module_attribute *modinfo_attrs[] = {
+	&module_uevent,
+	&modinfo_version,
+	&modinfo_srcversion,
+	&modinfo_initstate,
+	&modinfo_coresize,
+	&modinfo_initsize,
+	&modinfo_taint,
+#ifdef CONFIG_MODULE_UNLOAD
+	&modinfo_refcnt,
+#endif
+	NULL,
+};
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+static int try_to_force_load(struct module *mod, const char *reason)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+	if (!test_taint(TAINT_FORCED_MODULE))
+		pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+	add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
+	return 0;
+#else
+	return -ENOEXEC;
+#endif
+}
+
+#ifdef CONFIG_MODVERSIONS
+
+static u32 resolve_rel_crc(const s32 *crc)
+{
+	return *(u32 *)((void *)crc + *crc);
+}
+
+static int check_version(const struct load_info *info,
+			 const char *symname,
+			 struct module *mod,
+			 const s32 *crc)
+{
+	Elf_Shdr *sechdrs = info->sechdrs;
+	unsigned int versindex = info->index.vers;
+	unsigned int i, num_versions;
+	struct modversion_info *versions;
+
+	/* Exporting module didn't supply crcs?  OK, we're already tainted. */
+	if (!crc)
+		return 1;
+
+	/* No versions at all?  modprobe --force does this. */
+	if (versindex == 0)
+		return try_to_force_load(mod, symname) == 0;
+
+	versions = (void *) sechdrs[versindex].sh_addr;
+	num_versions = sechdrs[versindex].sh_size
+		/ sizeof(struct modversion_info);
+
+	for (i = 0; i < num_versions; i++) {
+		u32 crcval;
+
+		if (strcmp(versions[i].name, symname) != 0)
+			continue;
+
+		if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
+			crcval = resolve_rel_crc(crc);
+		else
+			crcval = *crc;
+		if (versions[i].crc == crcval)
+			return 1;
+		pr_debug("Found checksum %X vs module %lX\n",
+			 crcval, versions[i].crc);
+		goto bad_version;
+	}
+
+	/* Broken toolchain. Warn once, then let it go.. */
+	pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+	return 1;
+
+bad_version:
+	pr_warn("%s: disagrees about version of symbol %s\n",
+	       info->name, symname);
+	return 0;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+					  struct module *mod)
+{
+	struct find_symbol_arg fsa = {
+		.name	= "module_layout",
+		.gplok	= true,
+	};
+
+	/*
+	 * Since this should be found in kernel (which can't be removed), no
+	 * locking is necessary -- use preempt_disable() to placate lockdep.
+	 */
+	preempt_disable();
+	if (!find_symbol(&fsa)) {
+		preempt_enable();
+		BUG();
+	}
+	preempt_enable();
+	return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+static inline int same_magic(const char *amagic, const char *bmagic,
+			     bool has_crcs)
+{
+	if (has_crcs) {
+		amagic += strcspn(amagic, " ");
+		bmagic += strcspn(bmagic, " ");
+	}
+	return strcmp(amagic, bmagic) == 0;
+}
+#else
+static inline int check_version(const struct load_info *info,
+				const char *symname,
+				struct module *mod,
+				const s32 *crc)
+{
+	return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+					  struct module *mod)
+{
+	return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic,
+			     bool has_crcs)
+{
+	return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
+
+static char *get_modinfo(const struct load_info *info, const char *tag);
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+			      char *prev);
+
+static int verify_namespace_is_imported(const struct load_info *info,
+					const struct kernel_symbol *sym,
+					struct module *mod)
+{
+	const char *namespace;
+	char *imported_namespace;
+
+	namespace = kernel_symbol_namespace(sym);
+	if (namespace && namespace[0]) {
+		imported_namespace = get_modinfo(info, "import_ns");
+		while (imported_namespace) {
+			if (strcmp(namespace, imported_namespace) == 0)
+				return 0;
+			imported_namespace = get_next_modinfo(
+				info, "import_ns", imported_namespace);
+		}
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+		pr_warn(
+#else
+		pr_err(
+#endif
+			"%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+			mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+		return -EINVAL;
+#endif
+	}
+	return 0;
+}
+
+static bool inherit_taint(struct module *mod, struct module *owner)
+{
+	if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+		return true;
+
+	if (mod->using_gplonly_symbols) {
+		pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
+			mod->name, owner->name);
+		return false;
+	}
+
+	if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+		pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
+			mod->name, owner->name);
+		set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+	}
+	return true;
+}
+
+/* Resolve a symbol for this module.  I.e. if we find one, record usage. */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+						  const struct load_info *info,
+						  const char *name,
+						  char ownername[])
+{
+	struct find_symbol_arg fsa = {
+		.name	= name,
+		.gplok	= !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
+		.warn	= true,
+	};
+	int err;
+
+	/*
+	 * The module_mutex should not be a heavily contended lock;
+	 * if we get the occasional sleep here, we'll go an extra iteration
+	 * in the wait_event_interruptible(), which is harmless.
+	 */
+	sched_annotate_sleep();
+	mutex_lock(&module_mutex);
+	if (!find_symbol(&fsa))
+		goto unlock;
+
+	if (fsa.license == GPL_ONLY)
+		mod->using_gplonly_symbols = true;
+
+	if (!inherit_taint(mod, fsa.owner)) {
+		fsa.sym = NULL;
+		goto getname;
+	}
+
+	if (!check_version(info, name, mod, fsa.crc)) {
+		fsa.sym = ERR_PTR(-EINVAL);
+		goto getname;
+	}
+
+	err = verify_namespace_is_imported(info, fsa.sym, mod);
+	if (err) {
+		fsa.sym = ERR_PTR(err);
+		goto getname;
+	}
+
+	err = ref_module(mod, fsa.owner);
+	if (err) {
+		fsa.sym = ERR_PTR(err);
+		goto getname;
+	}
+
+getname:
+	/* We must make copy under the lock if we failed to get ref. */
+	strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
+unlock:
+	mutex_unlock(&module_mutex);
+	return fsa.sym;
+}
+
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+		    const struct load_info *info,
+		    const char *name)
+{
+	const struct kernel_symbol *ksym;
+	char owner[MODULE_NAME_LEN];
+
+	if (wait_event_interruptible_timeout(module_wq,
+			!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+			|| PTR_ERR(ksym) != -EBUSY,
+					     30 * HZ) <= 0) {
+		pr_warn("%s: gave up waiting for init of module %s.\n",
+			mod->name, owner);
+	}
+	return ksym;
+}
+
+#ifdef CONFIG_KALLSYMS
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#endif
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_SYSFS
+
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attr {
+	struct bin_attribute battr;
+	unsigned long address;
+};
+
+struct module_sect_attrs {
+	struct attribute_group grp;
+	unsigned int nsections;
+	struct module_sect_attr attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+				struct bin_attribute *battr,
+				char *buf, loff_t pos, size_t count)
+{
+	struct module_sect_attr *sattr =
+		container_of(battr, struct module_sect_attr, battr);
+	char bounce[MODULE_SECT_READ_SIZE + 1];
+	size_t wrote;
+
+	if (pos != 0)
+		return -EINVAL;
+
+	/*
+	 * Since we're a binary read handler, we must account for the
+	 * trailing NUL byte that sprintf will write: if "buf" is
+	 * too small to hold the NUL, or the NUL is exactly the last
+	 * byte, the read will look like it got truncated by one byte.
+	 * Since there is no way to ask sprintf nicely to not write
+	 * the NUL, we have to use a bounce buffer.
+	 */
+	wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+			 kallsyms_show_value(file->f_cred)
+				? (void *)sattr->address : NULL);
+	count = min(count, wrote);
+	memcpy(buf, bounce, count);
+
+	return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	unsigned int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].battr.attr.name);
+	kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+	unsigned int nloaded = 0, i, size[2];
+	struct module_sect_attrs *sect_attrs;
+	struct module_sect_attr *sattr;
+	struct bin_attribute **gattr;
+
+	/* Count loaded sections and allocate structures */
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]))
+			nloaded++;
+	size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
+			sizeof(sect_attrs->grp.bin_attrs[0]));
+	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (sect_attrs == NULL)
+		return;
+
+	/* Setup section attributes. */
+	sect_attrs->grp.name = "sections";
+	sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+
+	sect_attrs->nsections = 0;
+	sattr = &sect_attrs->attrs[0];
+	gattr = &sect_attrs->grp.bin_attrs[0];
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *sec = &info->sechdrs[i];
+		if (sect_empty(sec))
+			continue;
+		sysfs_bin_attr_init(&sattr->battr);
+		sattr->address = sec->sh_addr;
+		sattr->battr.attr.name =
+			kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+		if (sattr->battr.attr.name == NULL)
+			goto out;
+		sect_attrs->nsections++;
+		sattr->battr.read = module_sect_read;
+		sattr->battr.size = MODULE_SECT_READ_SIZE;
+		sattr->battr.attr.mode = 0400;
+		*(gattr++) = &(sattr++)->battr;
+	}
+	*gattr = NULL;
+
+	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+		goto out;
+
+	mod->sect_attrs = sect_attrs;
+	return;
+  out:
+	free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+	if (mod->sect_attrs) {
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->sect_attrs->grp);
+		/*
+		 * We are positive that no one is using any sect attrs
+		 * at this point.  Deallocate immediately.
+		 */
+		free_sect_attrs(mod->sect_attrs);
+		mod->sect_attrs = NULL;
+	}
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+	struct kobject *dir;
+	unsigned int notes;
+	struct bin_attribute attrs[];
+};
+
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t pos, size_t count)
+{
+	/*
+	 * The caller checked the pos and count against our size.
+	 */
+	memcpy(buf, bin_attr->private + pos, count);
+	return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+			     unsigned int i)
+{
+	if (notes_attrs->dir) {
+		while (i-- > 0)
+			sysfs_remove_bin_file(notes_attrs->dir,
+					      &notes_attrs->attrs[i]);
+		kobject_put(notes_attrs->dir);
+	}
+	kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+	unsigned int notes, loaded, i;
+	struct module_notes_attrs *notes_attrs;
+	struct bin_attribute *nattr;
+
+	/* failed to create section attributes, so can't create notes */
+	if (!mod->sect_attrs)
+		return;
+
+	/* Count notes sections and allocate structures.  */
+	notes = 0;
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]) &&
+		    (info->sechdrs[i].sh_type == SHT_NOTE))
+			++notes;
+
+	if (notes == 0)
+		return;
+
+	notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+			      GFP_KERNEL);
+	if (notes_attrs == NULL)
+		return;
+
+	notes_attrs->notes = notes;
+	nattr = &notes_attrs->attrs[0];
+	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+		if (sect_empty(&info->sechdrs[i]))
+			continue;
+		if (info->sechdrs[i].sh_type == SHT_NOTE) {
+			sysfs_bin_attr_init(nattr);
+			nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
+			nattr->attr.mode = S_IRUGO;
+			nattr->size = info->sechdrs[i].sh_size;
+			nattr->private = (void *) info->sechdrs[i].sh_addr;
+			nattr->read = module_notes_read;
+			++nattr;
+		}
+		++loaded;
+	}
+
+	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+	if (!notes_attrs->dir)
+		goto out;
+
+	for (i = 0; i < notes; ++i)
+		if (sysfs_create_bin_file(notes_attrs->dir,
+					  &notes_attrs->attrs[i]))
+			goto out;
+
+	mod->notes_attrs = notes_attrs;
+	return;
+
+  out:
+	free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+	if (mod->notes_attrs)
+		free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else
+
+static inline void add_sect_attrs(struct module *mod,
+				  const struct load_info *info)
+{
+}
+
+static inline void remove_sect_attrs(struct module *mod)
+{
+}
+
+static inline void add_notes_attrs(struct module *mod,
+				   const struct load_info *info)
+{
+}
+
+static inline void remove_notes_attrs(struct module *mod)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+	struct module_use *use;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(use, &mod->target_list, target_list)
+		sysfs_remove_link(use->target->holders_dir, mod->name);
+	mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+	int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+	struct module_use *use;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(use, &mod->target_list, target_list) {
+		ret = sysfs_create_link(use->target->holders_dir,
+					&mod->mkobj.kobj, mod->name);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&module_mutex);
+	if (ret)
+		del_usage_links(mod);
+#endif
+	return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end);
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	struct module_attribute *temp_attr;
+	int error = 0;
+	int i;
+
+	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+					(ARRAY_SIZE(modinfo_attrs) + 1)),
+					GFP_KERNEL);
+	if (!mod->modinfo_attrs)
+		return -ENOMEM;
+
+	temp_attr = mod->modinfo_attrs;
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (!attr->test || attr->test(mod)) {
+			memcpy(temp_attr, attr, sizeof(*temp_attr));
+			sysfs_attr_init(&temp_attr->attr);
+			error = sysfs_create_file(&mod->mkobj.kobj,
+					&temp_attr->attr);
+			if (error)
+				goto error_out;
+			++temp_attr;
+		}
+	}
+
+	return 0;
+
+error_out:
+	if (i > 0)
+		module_remove_modinfo_attrs(mod, --i);
+	else
+		kfree(mod->modinfo_attrs);
+	return error;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+		if (end >= 0 && i > end)
+			break;
+		/* pick a field to test for end of list */
+		if (!attr->attr.name)
+			break;
+		sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+		if (attr->free)
+			attr->free(mod);
+	}
+	kfree(mod->modinfo_attrs);
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+	DECLARE_COMPLETION_ONSTACK(c);
+	mod->mkobj.kobj_completion = &c;
+	kobject_put(&mod->mkobj.kobj);
+	wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+	int err;
+	struct kobject *kobj;
+
+	if (!module_sysfs_initialized) {
+		pr_err("%s: module sysfs not initialized\n", mod->name);
+		err = -EINVAL;
+		goto out;
+	}
+
+	kobj = kset_find_obj(module_kset, mod->name);
+	if (kobj) {
+		pr_err("%s: module is already loaded\n", mod->name);
+		kobject_put(kobj);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mod->mkobj.mod = mod;
+
+	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+	mod->mkobj.kobj.kset = module_kset;
+	err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+				   "%s", mod->name);
+	if (err)
+		mod_kobject_put(mod);
+
+out:
+	return err;
+}
+
+static int mod_sysfs_setup(struct module *mod,
+			   const struct load_info *info,
+			   struct kernel_param *kparam,
+			   unsigned int num_params)
+{
+	int err;
+
+	err = mod_sysfs_init(mod);
+	if (err)
+		goto out;
+
+	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+	if (!mod->holders_dir) {
+		err = -ENOMEM;
+		goto out_unreg;
+	}
+
+	err = module_param_sysfs_setup(mod, kparam, num_params);
+	if (err)
+		goto out_unreg_holders;
+
+	err = module_add_modinfo_attrs(mod);
+	if (err)
+		goto out_unreg_param;
+
+	err = add_usage_links(mod);
+	if (err)
+		goto out_unreg_modinfo_attrs;
+
+	add_sect_attrs(mod, info);
+	add_notes_attrs(mod, info);
+
+	return 0;
+
+out_unreg_modinfo_attrs:
+	module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+	module_param_sysfs_remove(mod);
+out_unreg_holders:
+	kobject_put(mod->holders_dir);
+out_unreg:
+	mod_kobject_put(mod);
+out:
+	return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+	remove_notes_attrs(mod);
+	remove_sect_attrs(mod);
+	mod_kobject_put(mod);
+}
+
+static void init_param_lock(struct module *mod)
+{
+	mutex_init(&mod->param_lock);
+}
+#else /* !CONFIG_SYSFS */
+
+static int mod_sysfs_setup(struct module *mod,
+			   const struct load_info *info,
+			   struct kernel_param *kparam,
+			   unsigned int num_params)
+{
+	return 0;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+}
+
+static void del_usage_links(struct module *mod)
+{
+}
+
+static void init_param_lock(struct module *mod)
+{
+}
+#endif /* CONFIG_SYSFS */
+
+static void mod_sysfs_teardown(struct module *mod)
+{
+	del_usage_links(mod);
+	module_remove_modinfo_attrs(mod, -1);
+	module_param_sysfs_remove(mod);
+	kobject_put(mod->mkobj.drivers_dir);
+	kobject_put(mod->holders_dir);
+	mod_sysfs_fini(mod);
+}
+
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ *
+ * General layout of module is:
+ *          [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^                ^               ^               ^
+ * ro_size ------------------------|               |               |
+ * ro_after_init_size -----------------------------|               |
+ * size -----------------------------------------------------------|
+ *
+ * These values are always page-aligned (as is base)
+ */
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
+ * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
+ * whether we are strict.
+ */
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+static void frob_text(const struct module_layout *layout,
+		      int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+	set_memory((unsigned long)layout->base,
+		   layout->text_size >> PAGE_SHIFT);
+}
+
+static void module_enable_x(const struct module *mod)
+{
+	frob_text(&mod->core_layout, set_memory_x);
+	frob_text(&mod->init_layout, set_memory_x);
+}
+#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+static void module_enable_x(const struct module *mod) { }
+#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+
+#ifdef CONFIG_STRICT_MODULE_RWX
+static void frob_rodata(const struct module_layout *layout,
+			int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+	set_memory((unsigned long)layout->base + layout->text_size,
+		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
+}
+
+static void frob_ro_after_init(const struct module_layout *layout,
+				int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+	set_memory((unsigned long)layout->base + layout->ro_size,
+		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
+static void frob_writable_data(const struct module_layout *layout,
+			       int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
+	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
+}
+
+static void module_enable_ro(const struct module *mod, bool after_init)
+{
+	if (!rodata_enabled)
+		return;
+
+	set_vm_flush_reset_perms(mod->core_layout.base);
+	set_vm_flush_reset_perms(mod->init_layout.base);
+	frob_text(&mod->core_layout, set_memory_ro);
+
+	frob_rodata(&mod->core_layout, set_memory_ro);
+	frob_text(&mod->init_layout, set_memory_ro);
+	frob_rodata(&mod->init_layout, set_memory_ro);
+
+	if (after_init)
+		frob_ro_after_init(&mod->core_layout, set_memory_ro);
+}
+
+static void module_enable_nx(const struct module *mod)
+{
+	frob_rodata(&mod->core_layout, set_memory_nx);
+	frob_ro_after_init(&mod->core_layout, set_memory_nx);
+	frob_writable_data(&mod->core_layout, set_memory_nx);
+	frob_rodata(&mod->init_layout, set_memory_nx);
+	frob_writable_data(&mod->init_layout, set_memory_nx);
+}
+
+static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+				       char *secstrings, struct module *mod)
+{
+	const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
+	int i;
+
+	for (i = 0; i < hdr->e_shnum; i++) {
+		if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+			pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+				mod->name, secstrings + sechdrs[i].sh_name, i);
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
+
+#else /* !CONFIG_STRICT_MODULE_RWX */
+static void module_enable_nx(const struct module *mod) { }
+static void module_enable_ro(const struct module *mod, bool after_init) {}
+static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+				       char *secstrings, struct module *mod)
+{
+	return 0;
+}
+#endif /*  CONFIG_STRICT_MODULE_RWX */
+
+#ifdef CONFIG_LIVEPATCH
+/*
+ * Persist Elf information about a module. Copy the Elf header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	unsigned int size, symndx;
+	int ret;
+
+	size = sizeof(*mod->klp_info);
+	mod->klp_info = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info == NULL)
+		return -ENOMEM;
+
+	/* Elf header */
+	size = sizeof(mod->klp_info->hdr);
+	memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+	/* Elf section header table */
+	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+	mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+	if (mod->klp_info->sechdrs == NULL) {
+		ret = -ENOMEM;
+		goto free_info;
+	}
+
+	/* Elf section name string table */
+	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+	mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+	if (mod->klp_info->secstrings == NULL) {
+		ret = -ENOMEM;
+		goto free_sechdrs;
+	}
+
+	/* Elf symbol section index */
+	symndx = info->index.sym;
+	mod->klp_info->symndx = symndx;
+
+	/*
+	 * For livepatch modules, core_kallsyms.symtab is a complete
+	 * copy of the original symbol table. Adjust sh_addr to point
+	 * to core_kallsyms.symtab since the copy of the symtab in module
+	 * init memory is freed at the end of do_init_module().
+	 */
+	mod->klp_info->sechdrs[symndx].sh_addr = \
+		(unsigned long) mod->core_kallsyms.symtab;
+
+	return 0;
+
+free_sechdrs:
+	kfree(mod->klp_info->sechdrs);
+free_info:
+	kfree(mod->klp_info);
+	return ret;
+}
+
+static void free_module_elf(struct module *mod)
+{
+	kfree(mod->klp_info->sechdrs);
+	kfree(mod->klp_info->secstrings);
+	kfree(mod->klp_info);
+}
+#else /* !CONFIG_LIVEPATCH */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	return 0;
+}
+
+static void free_module_elf(struct module *mod)
+{
+}
+#endif /* CONFIG_LIVEPATCH */
+
+void __weak module_memfree(void *module_region)
+{
+	/*
+	 * This memory may be RO, and freeing RO memory in an interrupt is not
+	 * supported by vmalloc.
+	 */
+	WARN_ON(in_interrupt());
+	vfree(module_region);
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
+static void cfi_cleanup(struct module *mod);
+
+/* Free a module, remove from lists, etc. */
+static void free_module(struct module *mod)
+{
+	trace_module_free(mod);
+
+	mod_sysfs_teardown(mod);
+
+	/*
+	 * We leave it in list to prevent duplicate loads, but make sure
+	 * that noone uses it while it's being deconstructed.
+	 */
+	mutex_lock(&module_mutex);
+	mod->state = MODULE_STATE_UNFORMED;
+	mutex_unlock(&module_mutex);
+
+	/* Remove dynamic debug info */
+	ddebug_remove_module(mod->name);
+
+	/* Arch-specific cleanup. */
+	module_arch_cleanup(mod);
+
+	/* Module unload stuff */
+	module_unload_free(mod);
+
+	/* Free any allocated parameters. */
+	destroy_params(mod->kp, mod->num_kp);
+
+	if (is_livepatch_module(mod))
+		free_module_elf(mod);
+
+	/* Now we can delete it from the lists */
+	mutex_lock(&module_mutex);
+	/* Unlink carefully: kallsyms could be walking list. */
+	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
+	/* Remove this module from bug list, this uses list_del_rcu */
+	module_bug_cleanup(mod);
+	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
+	synchronize_rcu();
+	mutex_unlock(&module_mutex);
+
+	/* Clean up CFI for the module. */
+	cfi_cleanup(mod);
+
+	/* This may be empty, but that's OK */
+	module_arch_freeing_init(mod);
+	module_memfree(mod->init_layout.base);
+	kfree(mod->args);
+	percpu_modfree(mod);
+
+	/* Free lock-classes; relies on the preceding sync_rcu(). */
+	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+
+	/* Finally, free the core (containing the module structure) */
+	module_memfree(mod->core_layout.base);
+}
+
+void *__symbol_get(const char *symbol)
+{
+	struct find_symbol_arg fsa = {
+		.name	= symbol,
+		.gplok	= true,
+		.warn	= true,
+	};
+
+	preempt_disable();
+	if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) {
+		preempt_enable();
+		return NULL;
+	}
+	preempt_enable();
+	return (void *)kernel_symbol_value(fsa.sym);
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/*
+ * Ensure that an exported symbol [global namespace] does not already exist
+ * in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
+ */
+static int verify_exported_symbols(struct module *mod)
+{
+	unsigned int i;
+	const struct kernel_symbol *s;
+	struct {
+		const struct kernel_symbol *sym;
+		unsigned int num;
+	} arr[] = {
+		{ mod->syms, mod->num_syms },
+		{ mod->gpl_syms, mod->num_gpl_syms },
+	};
+
+	for (i = 0; i < ARRAY_SIZE(arr); i++) {
+		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+			struct find_symbol_arg fsa = {
+				.name	= kernel_symbol_name(s),
+				.gplok	= true,
+			};
+			if (find_symbol(&fsa)) {
+				pr_err("%s: exports duplicate symbol %s"
+				       " (owned by %s)\n",
+				       mod->name, kernel_symbol_name(s),
+				       module_name(fsa.owner));
+				return -ENOEXEC;
+			}
+		}
+	}
+	return 0;
+}
+
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+	/*
+	 * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+	 * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+	 * i386 has a similar problem but may not deserve a fix.
+	 *
+	 * If we ever have to ignore many symbols, consider refactoring the code to
+	 * only warn if referenced by a relocation.
+	 */
+	if (emachine == EM_386 || emachine == EM_X86_64)
+		return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+	return false;
+}
+
+/* Change all symbols so that st_value encodes the pointer directly. */
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+	Elf_Sym *sym = (void *)symsec->sh_addr;
+	unsigned long secbase;
+	unsigned int i;
+	int ret = 0;
+	const struct kernel_symbol *ksym;
+
+	for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+		const char *name = info->strtab + sym[i].st_name;
+
+		switch (sym[i].st_shndx) {
+		case SHN_COMMON:
+			/* Ignore common symbols */
+			if (!strncmp(name, "__gnu_lto", 9))
+				break;
+
+			/*
+			 * We compiled with -fno-common.  These are not
+			 * supposed to happen.
+			 */
+			pr_debug("Common symbol: %s\n", name);
+			pr_warn("%s: please compile with -fno-common\n",
+			       mod->name);
+			ret = -ENOEXEC;
+			break;
+
+		case SHN_ABS:
+			/* Don't need to do anything */
+			pr_debug("Absolute symbol: 0x%08lx\n",
+			       (long)sym[i].st_value);
+			break;
+
+		case SHN_LIVEPATCH:
+			/* Livepatch symbols are resolved by livepatch */
+			break;
+
+		case SHN_UNDEF:
+			ksym = resolve_symbol_wait(mod, info, name);
+			/* Ok if resolved.  */
+			if (ksym && !IS_ERR(ksym)) {
+				sym[i].st_value = kernel_symbol_value(ksym);
+				break;
+			}
+
+			/* Ok if weak or ignored.  */
+			if (!ksym &&
+			    (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+			     ignore_undef_symbol(info->hdr->e_machine, name)))
+				break;
+
+			ret = PTR_ERR(ksym) ?: -ENOENT;
+			pr_warn("%s: Unknown symbol %s (err %d)\n",
+				mod->name, name, ret);
+			break;
+
+		default:
+			/* Divert to percpu allocation if a percpu var. */
+			if (sym[i].st_shndx == info->index.pcpu)
+				secbase = (unsigned long)mod_percpu(mod);
+			else
+				secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
+			sym[i].st_value += secbase;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+	unsigned int i;
+	int err = 0;
+
+	/* Now do relocations. */
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		unsigned int infosec = info->sechdrs[i].sh_info;
+
+		/* Not a valid relocation section? */
+		if (infosec >= info->hdr->e_shnum)
+			continue;
+
+		/* Don't bother with non-allocated sections */
+		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+			continue;
+
+		if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+			err = klp_apply_section_relocs(mod, info->sechdrs,
+						       info->secstrings,
+						       info->strtab,
+						       info->index.sym, i,
+						       NULL);
+		else if (info->sechdrs[i].sh_type == SHT_REL)
+			err = apply_relocate(info->sechdrs, info->strtab,
+					     info->index.sym, i, mod);
+		else if (info->sechdrs[i].sh_type == SHT_RELA)
+			err = apply_relocate_add(info->sechdrs, info->strtab,
+						 info->index.sym, i, mod);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+/* Additional bytes needed by arch in front of individual sections */
+unsigned int __weak arch_mod_section_prepend(struct module *mod,
+					     unsigned int section)
+{
+	/* default implementation just returns zero */
+	return 0;
+}
+
+/* Update size with this section: return offset. */
+static long get_offset(struct module *mod, unsigned int *size,
+		       Elf_Shdr *sechdr, unsigned int section)
+{
+	long ret;
+
+	*size += arch_mod_section_prepend(mod, section);
+	ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
+	*size = ret + sechdr->sh_size;
+	return ret;
+}
+
+static bool module_init_layout_section(const char *sname)
+{
+#ifndef CONFIG_MODULE_UNLOAD
+	if (module_exit_section(sname))
+		return true;
+#endif
+	return module_init_section(sname);
+}
+
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data.  Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+	static unsigned long const masks[][2] = {
+		/*
+		 * NOTE: all executable code must be the first section
+		 * in this array; otherwise modify the text_size
+		 * finder in the two loops below
+		 */
+		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+		{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
+		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+	};
+	unsigned int m, i;
+
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		info->sechdrs[i].sh_entsize = ~0UL;
+
+	pr_debug("Core section allocation order:\n");
+	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+		for (i = 0; i < info->hdr->e_shnum; ++i) {
+			Elf_Shdr *s = &info->sechdrs[i];
+			const char *sname = info->secstrings + s->sh_name;
+
+			if ((s->sh_flags & masks[m][0]) != masks[m][0]
+			    || (s->sh_flags & masks[m][1])
+			    || s->sh_entsize != ~0UL
+			    || module_init_layout_section(sname))
+				continue;
+			s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
+			pr_debug("\t%s\n", sname);
+		}
+		switch (m) {
+		case 0: /* executable */
+			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.text_size = mod->core_layout.size;
+			break;
+		case 1: /* RO: text and ro-data */
+			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.ro_size = mod->core_layout.size;
+			break;
+		case 2: /* RO after init */
+			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.ro_after_init_size = mod->core_layout.size;
+			break;
+		case 4: /* whole core */
+			mod->core_layout.size = debug_align(mod->core_layout.size);
+			break;
+		}
+	}
+
+	pr_debug("Init section allocation order:\n");
+	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+		for (i = 0; i < info->hdr->e_shnum; ++i) {
+			Elf_Shdr *s = &info->sechdrs[i];
+			const char *sname = info->secstrings + s->sh_name;
+
+			if ((s->sh_flags & masks[m][0]) != masks[m][0]
+			    || (s->sh_flags & masks[m][1])
+			    || s->sh_entsize != ~0UL
+			    || !module_init_layout_section(sname))
+				continue;
+			s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
+					 | INIT_OFFSET_MASK);
+			pr_debug("\t%s\n", sname);
+		}
+		switch (m) {
+		case 0: /* executable */
+			mod->init_layout.size = debug_align(mod->init_layout.size);
+			mod->init_layout.text_size = mod->init_layout.size;
+			break;
+		case 1: /* RO: text and ro-data */
+			mod->init_layout.size = debug_align(mod->init_layout.size);
+			mod->init_layout.ro_size = mod->init_layout.size;
+			break;
+		case 2:
+			/*
+			 * RO after init doesn't apply to init_layout (only
+			 * core_layout), so it just takes the value of ro_size.
+			 */
+			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+			break;
+		case 4: /* whole init */
+			mod->init_layout.size = debug_align(mod->init_layout.size);
+			break;
+		}
+	}
+}
+
+static void set_license(struct module *mod, const char *license)
+{
+	if (!license)
+		license = "unspecified";
+
+	if (!license_is_gpl_compatible(license)) {
+		if (!test_taint(TAINT_PROPRIETARY_MODULE))
+			pr_warn("%s: module license '%s' taints kernel.\n",
+				mod->name, license);
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+				 LOCKDEP_NOW_UNRELIABLE);
+	}
+}
+
+/* Parse tag=value strings from .modinfo section */
+static char *next_string(char *string, unsigned long *secsize)
+{
+	/* Skip non-zero chars */
+	while (string[0]) {
+		string++;
+		if ((*secsize)-- <= 1)
+			return NULL;
+	}
+
+	/* Skip any zero padding. */
+	while (!string[0]) {
+		string++;
+		if ((*secsize)-- <= 1)
+			return NULL;
+	}
+	return string;
+}
+
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+			      char *prev)
+{
+	char *p;
+	unsigned int taglen = strlen(tag);
+	Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+	unsigned long size = infosec->sh_size;
+
+	/*
+	 * get_modinfo() calls made before rewrite_section_headers()
+	 * must use sh_offset, as sh_addr isn't set!
+	 */
+	char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+	if (prev) {
+		size -= prev - modinfo;
+		modinfo = next_string(prev, &size);
+	}
+
+	for (p = modinfo; p; p = next_string(p, &size)) {
+		if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+			return p + taglen + 1;
+	}
+	return NULL;
+}
+
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+	return get_next_modinfo(info, tag, NULL);
+}
+
+static void setup_modinfo(struct module *mod, struct load_info *info)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->setup)
+			attr->setup(mod, get_modinfo(info, attr->attr.name));
+	}
+}
+
+static void free_modinfo(struct module *mod)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->free)
+			attr->free(mod);
+	}
+}
+
+#ifdef CONFIG_KALLSYMS
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+							  const struct kernel_symbol *start,
+							  const struct kernel_symbol *stop)
+{
+	return bsearch(name, start, stop - start,
+			sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+		       const struct module *mod)
+{
+	const struct kernel_symbol *ks;
+	if (!mod)
+		ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+	else
+		ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+	return ks != NULL && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+	const Elf_Shdr *sechdrs = info->sechdrs;
+
+	if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+			return 'v';
+		else
+			return 'w';
+	}
+	if (sym->st_shndx == SHN_UNDEF)
+		return 'U';
+	if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+		return 'a';
+	if (sym->st_shndx >= SHN_LORESERVE)
+		return '?';
+	if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+		return 't';
+	if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
+	    && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+		if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+			return 'r';
+		else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+			return 'g';
+		else
+			return 'd';
+	}
+	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+		if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+			return 's';
+		else
+			return 'b';
+	}
+	if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+		      ".debug")) {
+		return 'n';
+	}
+	return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+			unsigned int shnum, unsigned int pcpundx)
+{
+	const Elf_Shdr *sec;
+
+	if (src->st_shndx == SHN_UNDEF
+	    || src->st_shndx >= shnum
+	    || !src->st_name)
+		return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+	if (src->st_shndx == pcpundx)
+		return true;
+#endif
+
+	sec = sechdrs + src->st_shndx;
+	if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+	    || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+	    || (sec->sh_entsize & INIT_OFFSET_MASK))
+		return false;
+
+	return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep.  This is simple, but has the effect of making multiple
+ * copies of duplicates.  We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+static void layout_symtab(struct module *mod, struct load_info *info)
+{
+	Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+	Elf_Shdr *strsect = info->sechdrs + info->index.str;
+	const Elf_Sym *src;
+	unsigned int i, nsrc, ndst, strtab_size = 0;
+
+	/* Put symbol section at end of init part of module. */
+	symsect->sh_flags |= SHF_ALLOC;
+	symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
+					 info->index.sym) | INIT_OFFSET_MASK;
+	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+	src = (void *)info->hdr + symsect->sh_offset;
+	nsrc = symsect->sh_size / sizeof(*src);
+
+	/* Compute total space required for the core symbols' strtab. */
+	for (ndst = i = 0; i < nsrc; i++) {
+		if (i == 0 || is_livepatch_module(mod) ||
+		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+				   info->index.pcpu)) {
+			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
+			ndst++;
+		}
+	}
+
+	/* Append room for core symbols at end of core part. */
+	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+	mod->core_layout.size += strtab_size;
+	info->core_typeoffs = mod->core_layout.size;
+	mod->core_layout.size += ndst * sizeof(char);
+	mod->core_layout.size = debug_align(mod->core_layout.size);
+
+	/* Put string table section at end of init part of module. */
+	strsect->sh_flags |= SHF_ALLOC;
+	strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
+					 info->index.str) | INIT_OFFSET_MASK;
+	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+	/* We'll tack temporary mod_kallsyms on the end. */
+	mod->init_layout.size = ALIGN(mod->init_layout.size,
+				      __alignof__(struct mod_kallsyms));
+	info->mod_kallsyms_init_off = mod->init_layout.size;
+	mod->init_layout.size += sizeof(struct mod_kallsyms);
+	info->init_typeoffs = mod->init_layout.size;
+	mod->init_layout.size += nsrc * sizeof(char);
+	mod->init_layout.size = debug_align(mod->init_layout.size);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section.  Later we switch to the cut-down
+ * core-only ones.
+ */
+static void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+	unsigned int i, ndst;
+	const Elf_Sym *src;
+	Elf_Sym *dst;
+	char *s;
+	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+
+	/* Set up to point into init section. */
+	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+	mod->kallsyms->symtab = (void *)symsec->sh_addr;
+	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+	/* Make sure we get permanent strtab: don't use info->strtab. */
+	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
+
+	/*
+	 * Now populate the cut down core kallsyms for after init
+	 * and set types up while we still have access to sections.
+	 */
+	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
+	src = mod->kallsyms->symtab;
+	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
+		mod->kallsyms->typetab[i] = elf_type(src + i, info);
+		if (i == 0 || is_livepatch_module(mod) ||
+		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+				   info->index.pcpu)) {
+			mod->core_kallsyms.typetab[ndst] =
+			    mod->kallsyms->typetab[i];
+			dst[ndst] = src[i];
+			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+			s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
+				     KSYM_NAME_LEN) + 1;
+		}
+	}
+	mod->core_kallsyms.num_symtab = ndst;
+}
+#else
+static inline void layout_symtab(struct module *mod, struct load_info *info)
+{
+}
+
+static void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+static void init_build_id(struct module *mod, const struct load_info *info)
+{
+	const Elf_Shdr *sechdr;
+	unsigned int i;
+
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		sechdr = &info->sechdrs[i];
+		if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+		    !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+					sechdr->sh_size))
+			break;
+	}
+}
+#else
+static void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
+{
+	if (!debug)
+		return;
+	ddebug_add_module(debug, num, mod->name);
+}
+
+static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
+{
+	if (debug)
+		ddebug_remove_module(mod->name);
+}
+
+void * __weak module_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
+}
+
+bool __weak module_init_section(const char *name)
+{
+	return strstarts(name, ".init");
+}
+
+bool __weak module_exit_section(const char *name)
+{
+	return strstarts(name, ".exit");
+}
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+static void kmemleak_load_module(const struct module *mod,
+				 const struct load_info *info)
+{
+	unsigned int i;
+
+	/* only scan the sections containing data */
+	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		/* Scan all writable sections that's not executable */
+		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+		    !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+		    (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
+			continue;
+
+		kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+				   info->sechdrs[i].sh_size, GFP_KERNEL);
+	}
+}
+#else
+static inline void kmemleak_load_module(const struct module *mod,
+					const struct load_info *info)
+{
+}
+#endif
+
+#ifdef CONFIG_MODULE_SIG
+static int module_sig_check(struct load_info *info, int flags)
+{
+	int err = -ENODATA;
+	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+	const char *reason;
+	const void *mod = info->hdr;
+	bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+				       MODULE_INIT_IGNORE_VERMAGIC);
+	/*
+	 * Do not allow mangled modules as a module with version information
+	 * removed is no longer the module that was signed.
+	 */
+	if (!mangled_module &&
+	    info->len > markerlen &&
+	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+		/* We truncate the module to discard the signature */
+		info->len -= markerlen;
+		err = mod_verify_sig(mod, info);
+		if (!err) {
+			info->sig_ok = true;
+			return 0;
+		}
+	}
+
+	/*
+	 * We don't permit modules to be loaded into the trusted kernels
+	 * without a valid signature on them, but if we're not enforcing,
+	 * certain errors are non-fatal.
+	 */
+	switch (err) {
+	case -ENODATA:
+		reason = "unsigned module";
+		break;
+	case -ENOPKG:
+		reason = "module with unsupported crypto";
+		break;
+	case -ENOKEY:
+		reason = "module with unavailable key";
+		break;
+
+	default:
+		/*
+		 * All other errors are fatal, including lack of memory,
+		 * unparseable signatures, and signature check failures --
+		 * even if signatures aren't required.
+		 */
+		return err;
+	}
+
+	if (is_module_sig_enforced()) {
+		pr_notice("Loading of %s is rejected\n", reason);
+		return -EKEYREJECTED;
+	}
+
+	return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
+#else /* !CONFIG_MODULE_SIG */
+static int module_sig_check(struct load_info *info, int flags)
+{
+	return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
+{
+#if defined(CONFIG_64BIT)
+	unsigned long long secend;
+#else
+	unsigned long secend;
+#endif
+
+	/*
+	 * Check for both overflow and offset/size being
+	 * too large.
+	 */
+	secend = shdr->sh_offset + shdr->sh_size;
+	if (secend < shdr->sh_offset || secend > info->len)
+		return -ENOEXEC;
+
+	return 0;
+}
+
+/*
+ * Sanity checks against invalid binaries, wrong arch, weird elf version.
+ *
+ * Also do basic validity checks against section offsets and sizes, the
+ * section name string table, and the indices used for it (sh_name).
+ */
+static int elf_validity_check(struct load_info *info)
+{
+	unsigned int i;
+	Elf_Shdr *shdr, *strhdr;
+	int err;
+
+	if (info->len < sizeof(*(info->hdr))) {
+		pr_err("Invalid ELF header len %lu\n", info->len);
+		goto no_exec;
+	}
+
+	if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+		pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+		goto no_exec;
+	}
+	if (info->hdr->e_type != ET_REL) {
+		pr_err("Invalid ELF header type: %u != %u\n",
+		       info->hdr->e_type, ET_REL);
+		goto no_exec;
+	}
+	if (!elf_check_arch(info->hdr)) {
+		pr_err("Invalid architecture in ELF header: %u\n",
+		       info->hdr->e_machine);
+		goto no_exec;
+	}
+	if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+		pr_err("Invalid ELF section header size\n");
+		goto no_exec;
+	}
+
+	/*
+	 * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
+	 * known and small. So e_shnum * sizeof(Elf_Shdr)
+	 * will not overflow unsigned long on any platform.
+	 */
+	if (info->hdr->e_shoff >= info->len
+	    || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+		info->len - info->hdr->e_shoff)) {
+		pr_err("Invalid ELF section header overflow\n");
+		goto no_exec;
+	}
+
+	info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+
+	/*
+	 * Verify if the section name table index is valid.
+	 */
+	if (info->hdr->e_shstrndx == SHN_UNDEF
+	    || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+		pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+		       info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+		       info->hdr->e_shnum);
+		goto no_exec;
+	}
+
+	strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+	err = validate_section_offset(info, strhdr);
+	if (err < 0) {
+		pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
+		return err;
+	}
+
+	/*
+	 * The section name table must be NUL-terminated, as required
+	 * by the spec. This makes strcmp and pr_* calls that access
+	 * strings in the section safe.
+	 */
+	info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+	if (info->secstrings[strhdr->sh_size - 1] != '\0') {
+		pr_err("ELF Spec violation: section name table isn't null terminated\n");
+		goto no_exec;
+	}
+
+	/*
+	 * The code assumes that section 0 has a length of zero and
+	 * an addr of zero, so check for it.
+	 */
+	if (info->sechdrs[0].sh_type != SHT_NULL
+	    || info->sechdrs[0].sh_size != 0
+	    || info->sechdrs[0].sh_addr != 0) {
+		pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+		       info->sechdrs[0].sh_type);
+		goto no_exec;
+	}
+
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		shdr = &info->sechdrs[i];
+		switch (shdr->sh_type) {
+		case SHT_NULL:
+		case SHT_NOBITS:
+			continue;
+		case SHT_SYMTAB:
+			if (shdr->sh_link == SHN_UNDEF
+			    || shdr->sh_link >= info->hdr->e_shnum) {
+				pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+				       shdr->sh_link, shdr->sh_link,
+				       info->hdr->e_shnum);
+				goto no_exec;
+			}
+			fallthrough;
+		default:
+			err = validate_section_offset(info, shdr);
+			if (err < 0) {
+				pr_err("Invalid ELF section in module (section %u type %u)\n",
+					i, shdr->sh_type);
+				return err;
+			}
+
+			if (shdr->sh_flags & SHF_ALLOC) {
+				if (shdr->sh_name >= strhdr->sh_size) {
+					pr_err("Invalid ELF section name in module (section %u type %u)\n",
+					       i, shdr->sh_type);
+					return -ENOEXEC;
+				}
+			}
+			break;
+		}
+	}
+
+	return 0;
+
+no_exec:
+	return -ENOEXEC;
+}
+
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+	do {
+		unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+		if (copy_from_user(dst, usrc, n) != 0)
+			return -EFAULT;
+		cond_resched();
+		dst += n;
+		usrc += n;
+		len -= n;
+	} while (len);
+	return 0;
+}
+
+#ifdef CONFIG_LIVEPATCH
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+	if (get_modinfo(info, "livepatch")) {
+		mod->klp = true;
+		add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+		pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+			       mod->name);
+	}
+
+	return 0;
+}
+#else /* !CONFIG_LIVEPATCH */
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+	if (get_modinfo(info, "livepatch")) {
+		pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+		       mod->name);
+		return -ENOEXEC;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+	if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+		return;
+
+	pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+		mod->name);
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+				  struct load_info *info)
+{
+	int err;
+
+	info->len = len;
+	if (info->len < sizeof(*(info->hdr)))
+		return -ENOEXEC;
+
+	err = security_kernel_load_data(LOADING_MODULE, true);
+	if (err)
+		return err;
+
+	/* Suck in entire file: we'll want most of it. */
+	info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
+	if (!info->hdr)
+		return -ENOMEM;
+
+	if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = security_kernel_post_load_data((char *)info->hdr, info->len,
+					     LOADING_MODULE, "init_module");
+out:
+	if (err)
+		vfree(info->hdr);
+
+	return err;
+}
+
+static void free_copy(struct load_info *info, int flags)
+{
+	if (flags & MODULE_INIT_COMPRESSED_FILE)
+		module_decompress_cleanup(info);
+	else
+		vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info, int flags)
+{
+	unsigned int i;
+
+	/* This should always be true, but let's be sure. */
+	info->sechdrs[0].sh_addr = 0;
+
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *shdr = &info->sechdrs[i];
+
+		/*
+		 * Mark all sections sh_addr with their address in the
+		 * temporary image.
+		 */
+		shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
+
+	}
+
+	/* Track but don't keep modinfo and version sections. */
+	info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+	info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+	return 0;
+}
+
+/*
+ * Set up our basic convenience variables (pointers to section headers,
+ * search for module section index etc), and do some basic section
+ * verification.
+ *
+ * Set info->mod to the temporary copy of the module in info->hdr. The final one
+ * will be allocated in move_module().
+ */
+static int setup_load_info(struct load_info *info, int flags)
+{
+	unsigned int i;
+
+	/* Try to find a name early so we can log errors with a module name */
+	info->index.info = find_sec(info, ".modinfo");
+	if (info->index.info)
+		info->name = get_modinfo(info, "name");
+
+	/* Find internal symbols and strings. */
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+			info->index.sym = i;
+			info->index.str = info->sechdrs[i].sh_link;
+			info->strtab = (char *)info->hdr
+				+ info->sechdrs[info->index.str].sh_offset;
+			break;
+		}
+	}
+
+	if (info->index.sym == 0) {
+		pr_warn("%s: module has no symbols (stripped?)\n",
+			info->name ?: "(missing .modinfo section or name field)");
+		return -ENOEXEC;
+	}
+
+	info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
+	if (!info->index.mod) {
+		pr_warn("%s: No module found in object\n",
+			info->name ?: "(missing .modinfo section or name field)");
+		return -ENOEXEC;
+	}
+	/* This is temporary: point mod into copy of data. */
+	info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
+
+	/*
+	 * If we didn't load the .modinfo 'name' field earlier, fall back to
+	 * on-disk struct mod 'name' field.
+	 */
+	if (!info->name)
+		info->name = info->mod->name;
+
+	if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+		info->index.vers = 0; /* Pretend no __versions section! */
+	else
+		info->index.vers = find_sec(info, "__versions");
+
+	info->index.pcpu = find_pcpusec(info);
+
+	return 0;
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+{
+	const char *modmagic = get_modinfo(info, "vermagic");
+	int err;
+
+	if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+		modmagic = NULL;
+
+	/* This is allowed: modprobe --force will invalidate it. */
+	if (!modmagic) {
+		err = try_to_force_load(mod, "bad vermagic");
+		if (err)
+			return err;
+	} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
+		pr_err("%s: version magic '%s' should be '%s'\n",
+		       info->name, modmagic, vermagic);
+		return -ENOEXEC;
+	}
+
+	if (!get_modinfo(info, "intree")) {
+		if (!test_taint(TAINT_OOT_MODULE))
+			pr_warn("%s: loading out-of-tree module taints kernel.\n",
+				mod->name);
+		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+	}
+
+	check_modinfo_retpoline(mod, info);
+
+	if (get_modinfo(info, "staging")) {
+		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+		pr_warn("%s: module is from the staging directory, the quality "
+			"is unknown, you have been warned.\n", mod->name);
+	}
+
+	err = check_modinfo_livepatch(mod, info);
+	if (err)
+		return err;
+
+	/* Set up license info based on the info section */
+	set_license(mod, get_modinfo(info, "license"));
+
+	return 0;
+}
+
+static int find_module_sections(struct module *mod, struct load_info *info)
+{
+	mod->kp = section_objs(info, "__param",
+			       sizeof(*mod->kp), &mod->num_kp);
+	mod->syms = section_objs(info, "__ksymtab",
+				 sizeof(*mod->syms), &mod->num_syms);
+	mod->crcs = section_addr(info, "__kcrctab");
+	mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+				     sizeof(*mod->gpl_syms),
+				     &mod->num_gpl_syms);
+	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+
+#ifdef CONFIG_CONSTRUCTORS
+	mod->ctors = section_objs(info, ".ctors",
+				  sizeof(*mod->ctors), &mod->num_ctors);
+	if (!mod->ctors)
+		mod->ctors = section_objs(info, ".init_array",
+				sizeof(*mod->ctors), &mod->num_ctors);
+	else if (find_sec(info, ".init_array")) {
+		/*
+		 * This shouldn't happen with same compiler and binutils
+		 * building all parts of the module.
+		 */
+		pr_warn("%s: has both .ctors and .init_array.\n",
+		       mod->name);
+		return -EINVAL;
+	}
+#endif
+
+	mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
+						&mod->noinstr_text_size);
+
+#ifdef CONFIG_TRACEPOINTS
+	mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+					     sizeof(*mod->tracepoints_ptrs),
+					     &mod->num_tracepoints);
+#endif
+#ifdef CONFIG_TREE_SRCU
+	mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+					     sizeof(*mod->srcu_struct_ptrs),
+					     &mod->num_srcu_structs);
+#endif
+#ifdef CONFIG_BPF_EVENTS
+	mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+					   sizeof(*mod->bpf_raw_events),
+					   &mod->num_bpf_raw_events);
+#endif
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
+#endif
+#ifdef CONFIG_JUMP_LABEL
+	mod->jump_entries = section_objs(info, "__jump_table",
+					sizeof(*mod->jump_entries),
+					&mod->num_jump_entries);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+	mod->trace_events = section_objs(info, "_ftrace_events",
+					 sizeof(*mod->trace_events),
+					 &mod->num_trace_events);
+	mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+					sizeof(*mod->trace_evals),
+					&mod->num_trace_evals);
+#endif
+#ifdef CONFIG_TRACING
+	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
+					 sizeof(*mod->trace_bprintk_fmt_start),
+					 &mod->num_trace_bprintk_fmt);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	/* sechdrs[0].sh_size is always zero */
+	mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
+					     sizeof(*mod->ftrace_callsites),
+					     &mod->num_ftrace_callsites);
+#endif
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+	mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+					    sizeof(*mod->ei_funcs),
+					    &mod->num_ei_funcs);
+#endif
+#ifdef CONFIG_KPROBES
+	mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
+						&mod->kprobes_text_size);
+	mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
+						sizeof(unsigned long),
+						&mod->num_kprobe_blacklist);
+#endif
+#ifdef CONFIG_PRINTK_INDEX
+	mod->printk_index_start = section_objs(info, ".printk_index",
+					       sizeof(*mod->printk_index_start),
+					       &mod->printk_index_size);
+#endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+	mod->static_call_sites = section_objs(info, ".static_call_sites",
+					      sizeof(*mod->static_call_sites),
+					      &mod->num_static_call_sites);
+#endif
+	mod->extable = section_objs(info, "__ex_table",
+				    sizeof(*mod->extable), &mod->num_exentries);
+
+	if (section_addr(info, "__obsparm"))
+		pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+
+	info->debug = section_objs(info, "__dyndbg",
+				   sizeof(*info->debug), &info->num_debug);
+
+	return 0;
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+	int i;
+	void *ptr;
+
+	/* Do the allocs. */
+	ptr = module_alloc(mod->core_layout.size);
+	/*
+	 * The pointer to this block is stored in the module structure
+	 * which is inside the block. Just mark it as not being a
+	 * leak.
+	 */
+	kmemleak_not_leak(ptr);
+	if (!ptr)
+		return -ENOMEM;
+
+	memset(ptr, 0, mod->core_layout.size);
+	mod->core_layout.base = ptr;
+
+	if (mod->init_layout.size) {
+		ptr = module_alloc(mod->init_layout.size);
+		/*
+		 * The pointer to this block is stored in the module structure
+		 * which is inside the block. This block doesn't need to be
+		 * scanned as it contains data and code that will be freed
+		 * after the module is initialized.
+		 */
+		kmemleak_ignore(ptr);
+		if (!ptr) {
+			module_memfree(mod->core_layout.base);
+			return -ENOMEM;
+		}
+		memset(ptr, 0, mod->init_layout.size);
+		mod->init_layout.base = ptr;
+	} else
+		mod->init_layout.base = NULL;
+
+	/* Transfer each section which specifies SHF_ALLOC */
+	pr_debug("final section addresses:\n");
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		void *dest;
+		Elf_Shdr *shdr = &info->sechdrs[i];
+
+		if (!(shdr->sh_flags & SHF_ALLOC))
+			continue;
+
+		if (shdr->sh_entsize & INIT_OFFSET_MASK)
+			dest = mod->init_layout.base
+				+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
+		else
+			dest = mod->core_layout.base + shdr->sh_entsize;
+
+		if (shdr->sh_type != SHT_NOBITS)
+			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+		/* Update sh_addr to point to copy in image. */
+		shdr->sh_addr = (unsigned long)dest;
+		pr_debug("\t0x%lx %s\n",
+			 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
+	}
+
+	return 0;
+}
+
+static int check_module_license_and_versions(struct module *mod)
+{
+	int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
+	/*
+	 * ndiswrapper is under GPL by itself, but loads proprietary modules.
+	 * Don't use add_taint_module(), as it would prevent ndiswrapper from
+	 * using GPL-only symbols it needs.
+	 */
+	if (strcmp(mod->name, "ndiswrapper") == 0)
+		add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+
+	/* driverloader was caught wrongly pretending to be under GPL */
+	if (strcmp(mod->name, "driverloader") == 0)
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+				 LOCKDEP_NOW_UNRELIABLE);
+
+	/* lve claims to be GPL but upstream won't provide source */
+	if (strcmp(mod->name, "lve") == 0)
+		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+				 LOCKDEP_NOW_UNRELIABLE);
+
+	if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+		pr_warn("%s: module license taints kernel.\n", mod->name);
+
+#ifdef CONFIG_MODVERSIONS
+	if ((mod->num_syms && !mod->crcs) ||
+	    (mod->num_gpl_syms && !mod->gpl_crcs)) {
+		return try_to_force_load(mod,
+					 "no versions for exported symbols");
+	}
+#endif
+	return 0;
+}
+
+static void flush_module_icache(const struct module *mod)
+{
+	/*
+	 * Flush the instruction cache, since we've played with text.
+	 * Do it before processing of module parameters, so the module
+	 * can provide parameter accessor functions of its own.
+	 */
+	if (mod->init_layout.base)
+		flush_icache_range((unsigned long)mod->init_layout.base,
+				   (unsigned long)mod->init_layout.base
+				   + mod->init_layout.size);
+	flush_icache_range((unsigned long)mod->core_layout.base,
+			   (unsigned long)mod->core_layout.base + mod->core_layout.size);
+}
+
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+				     Elf_Shdr *sechdrs,
+				     char *secstrings,
+				     struct module *mod)
+{
+	return 0;
+}
+
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(const char *module_name)
+{
+	const char *p;
+	size_t len;
+
+	if (!module_blacklist)
+		return false;
+
+	for (p = module_blacklist; *p; p += len) {
+		len = strcspn(p, ",");
+		if (strlen(module_name) == len && !memcmp(module_name, p, len))
+			return true;
+		if (p[len] == ',')
+			len++;
+	}
+	return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
+static struct module *layout_and_allocate(struct load_info *info, int flags)
+{
+	struct module *mod;
+	unsigned int ndx;
+	int err;
+
+	err = check_modinfo(info->mod, info, flags);
+	if (err)
+		return ERR_PTR(err);
+
+	/* Allow arches to frob section contents and sizes.  */
+	err = module_frob_arch_sections(info->hdr, info->sechdrs,
+					info->secstrings, info->mod);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
+					  info->secstrings, info->mod);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	/* We will do a special allocation for per-cpu sections later. */
+	info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+	/*
+	 * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+	 * layout_sections() can put it in the right place.
+	 * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+	 */
+	ndx = find_sec(info, ".data..ro_after_init");
+	if (ndx)
+		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+	/*
+	 * Mark the __jump_table section as ro_after_init as well: these data
+	 * structures are never modified, with the exception of entries that
+	 * refer to code in the __init section, which are annotated as such
+	 * at module load time.
+	 */
+	ndx = find_sec(info, "__jump_table");
+	if (ndx)
+		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
+	/*
+	 * Determine total sizes, and put offsets in sh_entsize.  For now
+	 * this is done generically; there doesn't appear to be any
+	 * special cases for the architectures.
+	 */
+	layout_sections(info->mod, info);
+	layout_symtab(info->mod, info);
+
+	/* Allocate and move to the final place */
+	err = move_module(info->mod, info);
+	if (err)
+		return ERR_PTR(err);
+
+	/* Module has been copied to its final place now: return it. */
+	mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+	kmemleak_load_module(mod, info);
+	return mod;
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+	percpu_modfree(mod);
+	module_arch_freeing_init(mod);
+	module_memfree(mod->init_layout.base);
+	module_memfree(mod->core_layout.base);
+}
+
+int __weak module_finalize(const Elf_Ehdr *hdr,
+			   const Elf_Shdr *sechdrs,
+			   struct module *me)
+{
+	return 0;
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+	/* Sort exception table now relocations are done. */
+	sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+	/* Copy relocated percpu area over. */
+	percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+		       info->sechdrs[info->index.pcpu].sh_size);
+
+	/* Setup kallsyms-specific fields. */
+	add_kallsyms(mod, info);
+
+	/* Arch-specific module finalizing. */
+	return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Is this module of this name done loading?  No locks held. */
+static bool finished_loading(const char *name)
+{
+	struct module *mod;
+	bool ret;
+
+	/*
+	 * The module_mutex should not be a heavily contended lock;
+	 * if we get the occasional sleep here, we'll go an extra iteration
+	 * in the wait_event_interruptible(), which is harmless.
+	 */
+	sched_annotate_sleep();
+	mutex_lock(&module_mutex);
+	mod = find_module_all(name, strlen(name), true);
+	ret = !mod || mod->state == MODULE_STATE_LIVE;
+	mutex_unlock(&module_mutex);
+
+	return ret;
+}
+
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+	unsigned long i;
+
+	for (i = 0; i < mod->num_ctors; i++)
+		mod->ctors[i]();
+#endif
+}
+
+/* For freeing module_init on success, in case kallsyms traversing */
+struct mod_initfree {
+	struct llist_node node;
+	void *module_init;
+};
+
+static void do_free_init(struct work_struct *w)
+{
+	struct llist_node *pos, *n, *list;
+	struct mod_initfree *initfree;
+
+	list = llist_del_all(&init_free_list);
+
+	synchronize_rcu();
+
+	llist_for_each_safe(pos, n, list) {
+		initfree = container_of(pos, struct mod_initfree, node);
+		module_memfree(initfree->module_init);
+		kfree(initfree);
+	}
+}
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
+{
+	int ret = 0;
+	struct mod_initfree *freeinit;
+
+	freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+	if (!freeinit) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	freeinit->module_init = mod->init_layout.base;
+
+	do_mod_ctors(mod);
+	/* Start the module */
+	if (mod->init != NULL)
+		ret = do_one_initcall(mod->init);
+	if (ret < 0) {
+		goto fail_free_freeinit;
+	}
+	if (ret > 0) {
+		pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+			"follow 0/-E convention\n"
+			"%s: loading module anyway...\n",
+			__func__, mod->name, ret, __func__);
+		dump_stack();
+	}
+
+	/* Now it's a first class citizen! */
+	mod->state = MODULE_STATE_LIVE;
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_LIVE, mod);
+
+	/* Delay uevent until module has finished its init routine */
+	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+
+	/*
+	 * We need to finish all async code before the module init sequence
+	 * is done. This has potential to deadlock if synchronous module
+	 * loading is requested from async (which is not allowed!).
+	 *
+	 * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+	 * request_module() from async workers") for more details.
+	 */
+	if (!mod->async_probe_requested)
+		async_synchronize_full();
+
+	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+			mod->init_layout.size);
+	mutex_lock(&module_mutex);
+	/* Drop initial reference. */
+	module_put(mod);
+	trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
+	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
+#endif
+	module_enable_ro(mod, true);
+	mod_tree_remove_init(mod);
+	module_arch_freeing_init(mod);
+	mod->init_layout.base = NULL;
+	mod->init_layout.size = 0;
+	mod->init_layout.ro_size = 0;
+	mod->init_layout.ro_after_init_size = 0;
+	mod->init_layout.text_size = 0;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+	/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
+	mod->btf_data = NULL;
+#endif
+	/*
+	 * We want to free module_init, but be aware that kallsyms may be
+	 * walking this with preempt disabled.  In all the failure paths, we
+	 * call synchronize_rcu(), but we don't want to slow down the success
+	 * path. module_memfree() cannot be called in an interrupt, so do the
+	 * work and call synchronize_rcu() in a work queue.
+	 *
+	 * Note that module_alloc() on most architectures creates W+X page
+	 * mappings which won't be cleaned up until do_free_init() runs.  Any
+	 * code such as mark_rodata_ro() which depends on those mappings to
+	 * be cleaned up needs to sync with the queued work - ie
+	 * rcu_barrier()
+	 */
+	if (llist_add(&freeinit->node, &init_free_list))
+		schedule_work(&init_free_wq);
+
+	mutex_unlock(&module_mutex);
+	wake_up_all(&module_wq);
+
+	return 0;
+
+fail_free_freeinit:
+	kfree(freeinit);
+fail:
+	/* Try to protect us from buggy refcounters. */
+	mod->state = MODULE_STATE_GOING;
+	synchronize_rcu();
+	module_put(mod);
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_GOING, mod);
+	klp_module_going(mod);
+	ftrace_release_mod(mod);
+	free_module(mod);
+	wake_up_all(&module_wq);
+	return ret;
+}
+
+static int may_init_module(void)
+{
+	if (!capable(CAP_SYS_MODULE) || modules_disabled)
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources.  In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+	int err;
+	struct module *old;
+
+	mod->state = MODULE_STATE_UNFORMED;
+
+again:
+	mutex_lock(&module_mutex);
+	old = find_module_all(mod->name, strlen(mod->name), true);
+	if (old != NULL) {
+		if (old->state != MODULE_STATE_LIVE) {
+			/* Wait in case it fails to load. */
+			mutex_unlock(&module_mutex);
+			err = wait_event_interruptible(module_wq,
+					       finished_loading(mod->name));
+			if (err)
+				goto out_unlocked;
+			goto again;
+		}
+		err = -EEXIST;
+		goto out;
+	}
+	mod_update_bounds(mod);
+	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
+	err = 0;
+
+out:
+	mutex_unlock(&module_mutex);
+out_unlocked:
+	return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+	int err;
+
+	mutex_lock(&module_mutex);
+
+	/* Find duplicate symbols (must be called under lock). */
+	err = verify_exported_symbols(mod);
+	if (err < 0)
+		goto out;
+
+	/* This relies on module_mutex for list integrity. */
+	module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+	module_enable_ro(mod, false);
+	module_enable_nx(mod);
+	module_enable_x(mod);
+
+	/*
+	 * Mark state as coming so strong_try_module_get() ignores us,
+	 * but kallsyms etc. can see us.
+	 */
+	mod->state = MODULE_STATE_COMING;
+	mutex_unlock(&module_mutex);
+
+	return 0;
+
+out:
+	mutex_unlock(&module_mutex);
+	return err;
+}
+
+static int prepare_coming_module(struct module *mod)
+{
+	int err;
+
+	ftrace_module_enable(mod);
+	err = klp_module_coming(mod);
+	if (err)
+		return err;
+
+	err = blocking_notifier_call_chain_robust(&module_notify_list,
+			MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
+	err = notifier_to_errno(err);
+	if (err)
+		klp_module_going(mod);
+
+	return err;
+}
+
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+				   void *arg)
+{
+	struct module *mod = arg;
+	int ret;
+
+	if (strcmp(param, "async_probe") == 0) {
+		mod->async_probe_requested = true;
+		return 0;
+	}
+
+	/* Check for magic 'dyndbg' arg */
+	ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+	if (ret != 0)
+		pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+	return 0;
+}
+
+static void cfi_init(struct module *mod);
+
+/*
+ * Allocate and load the module: note that size of section 0 is always
+ * zero, and we rely on this for optional sections.
+ */
+static int load_module(struct load_info *info, const char __user *uargs,
+		       int flags)
+{
+	struct module *mod;
+	long err = 0;
+	char *after_dashes;
+
+	/*
+	 * Do the signature check (if any) first. All that
+	 * the signature check needs is info->len, it does
+	 * not need any of the section info. That can be
+	 * set up later. This will minimize the chances
+	 * of a corrupt module causing problems before
+	 * we even get to the signature check.
+	 *
+	 * The check will also adjust info->len by stripping
+	 * off the sig length at the end of the module, making
+	 * checks against info->len more correct.
+	 */
+	err = module_sig_check(info, flags);
+	if (err)
+		goto free_copy;
+
+	/*
+	 * Do basic sanity checks against the ELF header and
+	 * sections.
+	 */
+	err = elf_validity_check(info);
+	if (err)
+		goto free_copy;
+
+	/*
+	 * Everything checks out, so set up the section info
+	 * in the info structure.
+	 */
+	err = setup_load_info(info, flags);
+	if (err)
+		goto free_copy;
+
+	/*
+	 * Now that we know we have the correct module name, check
+	 * if it's blacklisted.
+	 */
+	if (blacklisted(info->name)) {
+		err = -EPERM;
+		pr_err("Module %s is blacklisted\n", info->name);
+		goto free_copy;
+	}
+
+	err = rewrite_section_headers(info, flags);
+	if (err)
+		goto free_copy;
+
+	/* Check module struct version now, before we try to use module. */
+	if (!check_modstruct_version(info, info->mod)) {
+		err = -ENOEXEC;
+		goto free_copy;
+	}
+
+	/* Figure out module layout, and allocate all the memory. */
+	mod = layout_and_allocate(info, flags);
+	if (IS_ERR(mod)) {
+		err = PTR_ERR(mod);
+		goto free_copy;
+	}
+
+	audit_log_kern_module(mod->name);
+
+	/* Reserve our place in the list. */
+	err = add_unformed_module(mod);
+	if (err)
+		goto free_module;
+
+#ifdef CONFIG_MODULE_SIG
+	mod->sig_ok = info->sig_ok;
+	if (!mod->sig_ok) {
+		pr_notice_once("%s: module verification failed: signature "
+			       "and/or required key missing - tainting "
+			       "kernel\n", mod->name);
+		add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+	}
+#endif
+
+	/* To avoid stressing percpu allocator, do this once we're unique. */
+	err = percpu_modalloc(mod, info);
+	if (err)
+		goto unlink_mod;
+
+	/* Now module is in final location, initialize linked lists, etc. */
+	err = module_unload_init(mod);
+	if (err)
+		goto unlink_mod;
+
+	init_param_lock(mod);
+
+	/*
+	 * Now we've got everything in the final locations, we can
+	 * find optional sections.
+	 */
+	err = find_module_sections(mod, info);
+	if (err)
+		goto free_unload;
+
+	err = check_module_license_and_versions(mod);
+	if (err)
+		goto free_unload;
+
+	/* Set up MODINFO_ATTR fields */
+	setup_modinfo(mod, info);
+
+	/* Fix up syms, so that st_value is a pointer to location. */
+	err = simplify_symbols(mod, info);
+	if (err < 0)
+		goto free_modinfo;
+
+	err = apply_relocations(mod, info);
+	if (err < 0)
+		goto free_modinfo;
+
+	err = post_relocation(mod, info);
+	if (err < 0)
+		goto free_modinfo;
+
+	flush_module_icache(mod);
+
+	/* Setup CFI for the module. */
+	cfi_init(mod);
+
+	/* Now copy in args */
+	mod->args = strndup_user(uargs, ~0UL >> 1);
+	if (IS_ERR(mod->args)) {
+		err = PTR_ERR(mod->args);
+		goto free_arch_cleanup;
+	}
+
+	init_build_id(mod, info);
+	dynamic_debug_setup(mod, info->debug, info->num_debug);
+
+	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+	ftrace_module_init(mod);
+
+	/* Finally it's fully formed, ready to start executing. */
+	err = complete_formation(mod, info);
+	if (err)
+		goto ddebug_cleanup;
+
+	err = prepare_coming_module(mod);
+	if (err)
+		goto bug_cleanup;
+
+	/* Module is ready to execute: parsing args may do that. */
+	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+				  -32768, 32767, mod,
+				  unknown_module_param_cb);
+	if (IS_ERR(after_dashes)) {
+		err = PTR_ERR(after_dashes);
+		goto coming_cleanup;
+	} else if (after_dashes) {
+		pr_warn("%s: parameters '%s' after `--' ignored\n",
+		       mod->name, after_dashes);
+	}
+
+	/* Link in to sysfs. */
+	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
+	if (err < 0)
+		goto coming_cleanup;
+
+	if (is_livepatch_module(mod)) {
+		err = copy_module_elf(mod, info);
+		if (err < 0)
+			goto sysfs_cleanup;
+	}
+
+	/* Get rid of temporary copy. */
+	free_copy(info, flags);
+
+	/* Done! */
+	trace_module_load(mod);
+
+	return do_init_module(mod);
+
+ sysfs_cleanup:
+	mod_sysfs_teardown(mod);
+ coming_cleanup:
+	mod->state = MODULE_STATE_GOING;
+	destroy_params(mod->kp, mod->num_kp);
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_GOING, mod);
+	klp_module_going(mod);
+ bug_cleanup:
+	mod->state = MODULE_STATE_GOING;
+	/* module_bug_cleanup needs module_mutex protection */
+	mutex_lock(&module_mutex);
+	module_bug_cleanup(mod);
+	mutex_unlock(&module_mutex);
+
+ ddebug_cleanup:
+	ftrace_release_mod(mod);
+	dynamic_debug_remove(mod, info->debug);
+	synchronize_rcu();
+	kfree(mod->args);
+ free_arch_cleanup:
+	cfi_cleanup(mod);
+	module_arch_cleanup(mod);
+ free_modinfo:
+	free_modinfo(mod);
+ free_unload:
+	module_unload_free(mod);
+ unlink_mod:
+	mutex_lock(&module_mutex);
+	/* Unlink carefully: kallsyms could be walking list. */
+	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
+	wake_up_all(&module_wq);
+	/* Wait for RCU-sched synchronizing before releasing mod->list. */
+	synchronize_rcu();
+	mutex_unlock(&module_mutex);
+ free_module:
+	/* Free lock-classes; relies on the preceding sync_rcu() */
+	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+
+	module_deallocate(mod, info);
+ free_copy:
+	free_copy(info, flags);
+	return err;
+}
+
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+		unsigned long, len, const char __user *, uargs)
+{
+	int err;
+	struct load_info info = { };
+
+	err = may_init_module();
+	if (err)
+		return err;
+
+	pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+	       umod, len, uargs);
+
+	err = copy_module_from_user(umod, len, &info);
+	if (err)
+		return err;
+
+	return load_module(&info, uargs, 0);
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+	struct load_info info = { };
+	void *buf = NULL;
+	int len;
+	int err;
+
+	err = may_init_module();
+	if (err)
+		return err;
+
+	pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+	if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+		      |MODULE_INIT_IGNORE_VERMAGIC
+		      |MODULE_INIT_COMPRESSED_FILE))
+		return -EINVAL;
+
+	len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
+				       READING_MODULE);
+	if (len < 0)
+		return len;
+
+	if (flags & MODULE_INIT_COMPRESSED_FILE) {
+		err = module_decompress(&info, buf, len);
+		vfree(buf); /* compressed data is no longer needed */
+		if (err)
+			return err;
+	} else {
+		info.hdr = buf;
+		info.len = len;
+	}
+
+	return load_module(&info, uargs, flags);
+}
+
+static inline int within(unsigned long addr, void *start, unsigned long size)
+{
+	return ((void *)addr >= start && (void *)addr < start + size);
+}
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+	if (str[0] == '.' && str[1] == 'L')
+		return true;
+	return str[0] == '$' && strchr("axtd", str[1])
+	       && (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+					unsigned long addr,
+					unsigned long *size,
+					unsigned long *offset)
+{
+	unsigned int i, best = 0;
+	unsigned long nextval, bestval;
+	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+	/* At worse, next value is at end of module */
+	if (within_module_init(addr, mod))
+		nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
+	else
+		nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
+
+	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+	/*
+	 * Scan for closest preceding symbol, and next symbol. (ELF
+	 * starts real symbols at 1).
+	 */
+	for (i = 1; i < kallsyms->num_symtab; i++) {
+		const Elf_Sym *sym = &kallsyms->symtab[i];
+		unsigned long thisval = kallsyms_symbol_value(sym);
+
+		if (sym->st_shndx == SHN_UNDEF)
+			continue;
+
+		/*
+		 * We ignore unnamed symbols: they're uninformative
+		 * and inserted at a whim.
+		 */
+		if (*kallsyms_symbol_name(kallsyms, i) == '\0'
+		    || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+			continue;
+
+		if (thisval <= addr && thisval > bestval) {
+			best = i;
+			bestval = thisval;
+		}
+		if (thisval > addr && thisval < nextval)
+			nextval = thisval;
+	}
+
+	if (!best)
+		return NULL;
+
+	if (size)
+		*size = nextval - bestval;
+	if (offset)
+		*offset = addr - bestval;
+
+	return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+						     void *ptr)
+{
+	return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution.  NULL means not found.  Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption.
+ */
+const char *module_address_lookup(unsigned long addr,
+			    unsigned long *size,
+			    unsigned long *offset,
+			    char **modname,
+			    const unsigned char **modbuildid,
+			    char *namebuf)
+{
+	const char *ret = NULL;
+	struct module *mod;
+
+	preempt_disable();
+	mod = __module_address(addr);
+	if (mod) {
+		if (modname)
+			*modname = mod->name;
+		if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+			*modbuildid = mod->build_id;
+#else
+			*modbuildid = NULL;
+#endif
+		}
+
+		ret = find_kallsyms_symbol(mod, addr, size, offset);
+	}
+	/* Make a copy in here where it's safe */
+	if (ret) {
+		strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+		ret = namebuf;
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+	struct module *mod;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		if (within_module(addr, mod)) {
+			const char *sym;
+
+			sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+			if (!sym)
+				goto out;
+
+			strlcpy(symname, sym, KSYM_NAME_LEN);
+			preempt_enable();
+			return 0;
+		}
+	}
+out:
+	preempt_enable();
+	return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+			unsigned long *offset, char *modname, char *name)
+{
+	struct module *mod;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		if (within_module(addr, mod)) {
+			const char *sym;
+
+			sym = find_kallsyms_symbol(mod, addr, size, offset);
+			if (!sym)
+				goto out;
+			if (modname)
+				strlcpy(modname, mod->name, MODULE_NAME_LEN);
+			if (name)
+				strlcpy(name, sym, KSYM_NAME_LEN);
+			preempt_enable();
+			return 0;
+		}
+	}
+out:
+	preempt_enable();
+	return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+			char *name, char *module_name, int *exported)
+{
+	struct module *mod;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		struct mod_kallsyms *kallsyms;
+
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		kallsyms = rcu_dereference_sched(mod->kallsyms);
+		if (symnum < kallsyms->num_symtab) {
+			const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+			*value = kallsyms_symbol_value(sym);
+			*type = kallsyms->typetab[symnum];
+			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
+			*exported = is_exported(name, *value, mod);
+			preempt_enable();
+			return 0;
+		}
+		symnum -= kallsyms->num_symtab;
+	}
+	preempt_enable();
+	return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+	unsigned int i;
+	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+	for (i = 0; i < kallsyms->num_symtab; i++) {
+		const Elf_Sym *sym = &kallsyms->symtab[i];
+
+		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+		    sym->st_shndx != SHN_UNDEF)
+			return kallsyms_symbol_value(sym);
+	}
+	return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+	struct module *mod;
+	char *colon;
+	unsigned long ret = 0;
+
+	/* Don't lock: we're in enough trouble already. */
+	preempt_disable();
+	if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+		if ((mod = find_module_all(name, colon - name, false)) != NULL)
+			ret = find_kallsyms_symbol_value(mod, colon+1);
+	} else {
+		list_for_each_entry_rcu(mod, &modules, list) {
+			if (mod->state == MODULE_STATE_UNFORMED)
+				continue;
+			if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
+				break;
+		}
+	}
+	preempt_enable();
+	return ret;
+}
+
+#ifdef CONFIG_LIVEPATCH
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data)
+{
+	struct module *mod;
+	unsigned int i;
+	int ret = 0;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(mod, &modules, list) {
+		/* We hold module_mutex: no need for rcu_dereference_sched */
+		struct mod_kallsyms *kallsyms = mod->kallsyms;
+
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		for (i = 0; i < kallsyms->num_symtab; i++) {
+			const Elf_Sym *sym = &kallsyms->symtab[i];
+
+			if (sym->st_shndx == SHN_UNDEF)
+				continue;
+
+			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+				 mod, kallsyms_symbol_value(sym));
+			if (ret != 0)
+				goto out;
+
+			cond_resched();
+		}
+	}
+out:
+	mutex_unlock(&module_mutex);
+	return ret;
+}
+#endif /* CONFIG_LIVEPATCH */
+#endif /* CONFIG_KALLSYMS */
+
+static void cfi_init(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+	initcall_t *init;
+	exitcall_t *exit;
+
+	rcu_read_lock_sched();
+	mod->cfi_check = (cfi_check_fn)
+		find_kallsyms_symbol_value(mod, "__cfi_check");
+	init = (initcall_t *)
+		find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
+	exit = (exitcall_t *)
+		find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
+	rcu_read_unlock_sched();
+
+	/* Fix init/exit functions to point to the CFI jump table */
+	if (init)
+		mod->init = *init;
+#ifdef CONFIG_MODULE_UNLOAD
+	if (exit)
+		mod->exit = *exit;
+#endif
+
+	cfi_module_add(mod, module_addr_min);
+#endif
+}
+
+static void cfi_cleanup(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+	cfi_module_remove(mod, module_addr_min);
+#endif
+}
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
+static char *module_flags(struct module *mod, char *buf)
+{
+	int bx = 0;
+
+	BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+	if (mod->taints ||
+	    mod->state == MODULE_STATE_GOING ||
+	    mod->state == MODULE_STATE_COMING) {
+		buf[bx++] = '(';
+		bx += module_flags_taint(mod, buf + bx);
+		/* Show a - for module-is-being-unloaded */
+		if (mod->state == MODULE_STATE_GOING)
+			buf[bx++] = '-';
+		/* Show a + for module-is-being-loaded */
+		if (mod->state == MODULE_STATE_COMING)
+			buf[bx++] = '+';
+		buf[bx++] = ')';
+	}
+	buf[bx] = '\0';
+
+	return buf;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&module_mutex);
+	return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+	struct module *mod = list_entry(p, struct module, list);
+	char buf[MODULE_FLAGS_BUF_SIZE];
+	void *value;
+
+	/* We always ignore unformed modules. */
+	if (mod->state == MODULE_STATE_UNFORMED)
+		return 0;
+
+	seq_printf(m, "%s %u",
+		   mod->name, mod->init_layout.size + mod->core_layout.size);
+	print_unload_info(m, mod);
+
+	/* Informative for users. */
+	seq_printf(m, " %s",
+		   mod->state == MODULE_STATE_GOING ? "Unloading" :
+		   mod->state == MODULE_STATE_COMING ? "Loading" :
+		   "Live");
+	/* Used by oprofile and other similar tools. */
+	value = m->private ? NULL : mod->core_layout.base;
+	seq_printf(m, " 0x%px", value);
+
+	/* Taints info */
+	if (mod->taints)
+		seq_printf(m, " %s", module_flags(mod, buf));
+
+	seq_puts(m, "\n");
+	return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &modules_op);
+
+	if (!err) {
+		struct seq_file *m = file->private_data;
+		m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+	}
+
+	return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
+	.proc_open	= modules_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+	proc_create("modules", 0, NULL, &modules_proc_ops);
+	return 0;
+}
+module_init(proc_modules_init);
+#endif
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+	const struct exception_table_entry *e = NULL;
+	struct module *mod;
+
+	preempt_disable();
+	mod = __module_address(addr);
+	if (!mod)
+		goto out;
+
+	if (!mod->num_exentries)
+		goto out;
+
+	e = search_extable(mod->extable,
+			   mod->num_exentries,
+			   addr);
+out:
+	preempt_enable();
+
+	/*
+	 * Now, if we found one, we are running inside it now, hence
+	 * we cannot unload the module, hence no refcnt needed.
+	 */
+	return e;
+}
+
+/**
+ * is_module_address() - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
+ */
+bool is_module_address(unsigned long addr)
+{
+	bool ret;
+
+	preempt_disable();
+	ret = __module_address(addr) != NULL;
+	preempt_enable();
+
+	return ret;
+}
+
+/**
+ * __module_address() - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_address(unsigned long addr)
+{
+	struct module *mod;
+
+	if (addr < module_addr_min || addr > module_addr_max)
+		return NULL;
+
+	module_assert_mutex_or_preempt();
+
+	mod = mod_find(addr);
+	if (mod) {
+		BUG_ON(!within_module(addr, mod));
+		if (mod->state == MODULE_STATE_UNFORMED)
+			mod = NULL;
+	}
+	return mod;
+}
+
+/**
+ * is_module_text_address() - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module.  See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+	bool ret;
+
+	preempt_disable();
+	ret = __module_text_address(addr) != NULL;
+	preempt_enable();
+
+	return ret;
+}
+
+/**
+ * __module_text_address() - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+	struct module *mod = __module_address(addr);
+	if (mod) {
+		/* Make sure it's within the text section. */
+		if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
+		    && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
+			mod = NULL;
+	}
+	return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+	struct module *mod;
+	char buf[MODULE_FLAGS_BUF_SIZE];
+
+	printk(KERN_DEFAULT "Modules linked in:");
+	/* Most callers should already have preempt disabled, but make sure */
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		pr_cont(" %s%s", mod->name, module_flags(mod, buf));
+	}
+	preempt_enable();
+	if (last_unloaded_module[0])
+		pr_cont(" [last unloaded: %s]", last_unloaded_module);
+	pr_cont("\n");
+}
+
+#ifdef CONFIG_MODVERSIONS
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+		   struct modversion_info *ver,
+		   struct kernel_param *kp,
+		   struct kernel_symbol *ks,
+		   struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
+#endif
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
new file mode 100644
index 000000000000..8aeb6d2ee94b
--- /dev/null
+++ b/kernel/module/signing.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
+#include <linux/string.h>
+#include <linux/verification.h>
+#include <crypto/public_key.h>
+#include "internal.h"
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, struct load_info *info)
+{
+	struct module_signature ms;
+	size_t sig_len, modlen = info->len;
+	int ret;
+
+	pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+	if (modlen <= sizeof(ms))
+		return -EBADMSG;
+
+	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+	ret = mod_check_sig(&ms, modlen, "module");
+	if (ret)
+		return ret;
+
+	sig_len = be32_to_cpu(ms.sig_len);
+	modlen -= sig_len + sizeof(ms);
+	info->len = modlen;
+
+	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+				      VERIFY_USE_SECONDARY_KEYRING,
+				      VERIFYING_MODULE_SIGNATURE,
+				      NULL, NULL);
+}
diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c
deleted file mode 100644
index ffef98a20320..000000000000
--- a/kernel/module_decompress.c
+++ /dev/null
@@ -1,273 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright 2021 Google LLC.
- */
-
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/kobject.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysfs.h>
-#include <linux/vmalloc.h>
-
-#include "module-internal.h"
-
-static int module_extend_max_pages(struct load_info *info, unsigned int extent)
-{
-	struct page **new_pages;
-
-	new_pages = kvmalloc_array(info->max_pages + extent,
-				   sizeof(info->pages), GFP_KERNEL);
-	if (!new_pages)
-		return -ENOMEM;
-
-	memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages));
-	kvfree(info->pages);
-	info->pages = new_pages;
-	info->max_pages += extent;
-
-	return 0;
-}
-
-static struct page *module_get_next_page(struct load_info *info)
-{
-	struct page *page;
-	int error;
-
-	if (info->max_pages == info->used_pages) {
-		error = module_extend_max_pages(info, info->used_pages);
-		if (error)
-			return ERR_PTR(error);
-	}
-
-	page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
-
-	info->pages[info->used_pages++] = page;
-	return page;
-}
-
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
-#include <linux/zlib.h>
-#define MODULE_COMPRESSION	gzip
-#define MODULE_DECOMPRESS_FN	module_gzip_decompress
-
-/*
- * Calculate length of the header which consists of signature, header
- * flags, time stamp and operating system ID (10 bytes total), plus
- * an optional filename.
- */
-static size_t module_gzip_header_len(const u8 *buf, size_t size)
-{
-	const u8 signature[] = { 0x1f, 0x8b, 0x08 };
-	size_t len = 10;
-
-	if (size < len || memcmp(buf, signature, sizeof(signature)))
-		return 0;
-
-	if (buf[3] & 0x08) {
-		do {
-			/*
-			 * If we can't find the end of the file name we must
-			 * be dealing with a corrupted file.
-			 */
-			if (len == size)
-				return 0;
-		} while (buf[len++] != '\0');
-	}
-
-	return len;
-}
-
-static ssize_t module_gzip_decompress(struct load_info *info,
-				      const void *buf, size_t size)
-{
-	struct z_stream_s s = { 0 };
-	size_t new_size = 0;
-	size_t gzip_hdr_len;
-	ssize_t retval;
-	int rc;
-
-	gzip_hdr_len = module_gzip_header_len(buf, size);
-	if (!gzip_hdr_len) {
-		pr_err("not a gzip compressed module\n");
-		return -EINVAL;
-	}
-
-	s.next_in = buf + gzip_hdr_len;
-	s.avail_in = size - gzip_hdr_len;
-
-	s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
-	if (!s.workspace)
-		return -ENOMEM;
-
-	rc = zlib_inflateInit2(&s, -MAX_WBITS);
-	if (rc != Z_OK) {
-		pr_err("failed to initialize decompressor: %d\n", rc);
-		retval = -EINVAL;
-		goto out;
-	}
-
-	do {
-		struct page *page = module_get_next_page(info);
-		if (!page) {
-			retval = -ENOMEM;
-			goto out_inflate_end;
-		}
-
-		s.next_out = kmap(page);
-		s.avail_out = PAGE_SIZE;
-		rc = zlib_inflate(&s, 0);
-		kunmap(page);
-
-		new_size += PAGE_SIZE - s.avail_out;
-	} while (rc == Z_OK);
-
-	if (rc != Z_STREAM_END) {
-		pr_err("decompression failed with status %d\n", rc);
-		retval = -EINVAL;
-		goto out_inflate_end;
-	}
-
-	retval = new_size;
-
-out_inflate_end:
-	zlib_inflateEnd(&s);
-out:
-	kfree(s.workspace);
-	return retval;
-}
-#elif CONFIG_MODULE_COMPRESS_XZ
-#include <linux/xz.h>
-#define MODULE_COMPRESSION	xz
-#define MODULE_DECOMPRESS_FN	module_xz_decompress
-
-static ssize_t module_xz_decompress(struct load_info *info,
-				    const void *buf, size_t size)
-{
-	static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 };
-	struct xz_dec *xz_dec;
-	struct xz_buf xz_buf;
-	enum xz_ret xz_ret;
-	size_t new_size = 0;
-	ssize_t retval;
-
-	if (size < sizeof(signature) ||
-	    memcmp(buf, signature, sizeof(signature))) {
-		pr_err("not an xz compressed module\n");
-		return -EINVAL;
-	}
-
-	xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1);
-	if (!xz_dec)
-		return -ENOMEM;
-
-	xz_buf.in_size = size;
-	xz_buf.in = buf;
-	xz_buf.in_pos = 0;
-
-	do {
-		struct page *page = module_get_next_page(info);
-		if (!page) {
-			retval = -ENOMEM;
-			goto out;
-		}
-
-		xz_buf.out = kmap(page);
-		xz_buf.out_pos = 0;
-		xz_buf.out_size = PAGE_SIZE;
-		xz_ret = xz_dec_run(xz_dec, &xz_buf);
-		kunmap(page);
-
-		new_size += xz_buf.out_pos;
-	} while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK);
-
-	if (xz_ret != XZ_STREAM_END) {
-		pr_err("decompression failed with status %d\n", xz_ret);
-		retval = -EINVAL;
-		goto out;
-	}
-
-	retval = new_size;
-
- out:
-	xz_dec_end(xz_dec);
-	return retval;
-}
-#else
-#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
-#endif
-
-int module_decompress(struct load_info *info, const void *buf, size_t size)
-{
-	unsigned int n_pages;
-	ssize_t data_size;
-	int error;
-
-	/*
-	 * Start with number of pages twice as big as needed for
-	 * compressed data.
-	 */
-	n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2;
-	error = module_extend_max_pages(info, n_pages);
-
-	data_size = MODULE_DECOMPRESS_FN(info, buf, size);
-	if (data_size < 0) {
-		error = data_size;
-		goto err;
-	}
-
-	info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL);
-	if (!info->hdr) {
-		error = -ENOMEM;
-		goto err;
-	}
-
-	info->len = data_size;
-	return 0;
-
-err:
-	module_decompress_cleanup(info);
-	return error;
-}
-
-void module_decompress_cleanup(struct load_info *info)
-{
-	int i;
-
-	if (info->hdr)
-		vunmap(info->hdr);
-
-	for (i = 0; i < info->used_pages; i++)
-		__free_page(info->pages[i]);
-
-	kvfree(info->pages);
-
-	info->pages = NULL;
-	info->max_pages = info->used_pages = 0;
-}
-
-#ifdef CONFIG_SYSFS
-static ssize_t compression_show(struct kobject *kobj,
-				struct kobj_attribute *attr, char *buf)
-{
-	return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
-}
-static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
-
-static int __init module_decompress_sysfs_init(void)
-{
-	int error;
-
-	error = sysfs_create_file(&module_kset->kobj,
-				  &module_compression_attr.attr);
-	if (error)
-		pr_warn("Failed to create 'compression' attribute");
-
-	return 0;
-}
-late_initcall(module_decompress_sysfs_init);
-#endif
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index 8723ae70ea1f..000000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Module signature checker
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/module_signature.h>
-#include <linux/string.h>
-#include <linux/verification.h>
-#include <crypto/public_key.h>
-#include "module-internal.h"
-
-/*
- * Verify the signature on a module.
- */
-int mod_verify_sig(const void *mod, struct load_info *info)
-{
-	struct module_signature ms;
-	size_t sig_len, modlen = info->len;
-	int ret;
-
-	pr_devel("==>%s(,%zu)\n", __func__, modlen);
-
-	if (modlen <= sizeof(ms))
-		return -EBADMSG;
-
-	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-
-	ret = mod_check_sig(&ms, modlen, "module");
-	if (ret)
-		return ret;
-
-	sig_len = be32_to_cpu(ms.sig_len);
-	modlen -= sig_len + sizeof(ms);
-	info->len = modlen;
-
-	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      VERIFY_USE_SECONDARY_KEYRING,
-				      VERIFYING_MODULE_SIGNATURE,
-				      NULL, NULL);
-}
-- 
cgit v1.2.3


From 5b6547ed97f4f5dfc23f8e3970af6d11d7b7ed7e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Mar 2022 22:03:41 +0100
Subject: sched/core: Fix forceidle balancing

Steve reported that ChromeOS encounters the forceidle balancer being
ran from rt_mutex_setprio()'s balance_callback() invocation and
explodes.

Now, the forceidle balancer gets queued every time the idle task gets
selected, set_next_task(), which is strictly too often.
rt_mutex_setprio() also uses set_next_task() in the 'change' pattern:

	queued = task_on_rq_queued(p); /* p->on_rq == TASK_ON_RQ_QUEUED */
	running = task_current(rq, p); /* rq->curr == p */

	if (queued)
		dequeue_task(...);
	if (running)
		put_prev_task(...);

	/* change task properties */

	if (queued)
		enqueue_task(...);
	if (running)
		set_next_task(...);

However, rt_mutex_setprio() will explicitly not run this pattern on
the idle task (since priority boosting the idle task is quite insane).
Most other 'change' pattern users are pidhash based and would also not
apply to idle.

Also, the change pattern doesn't contain a __balance_callback()
invocation and hence we could have an out-of-band balance-callback,
which *should* trigger the WARN in rq_pin_lock() (which guards against
this exact anti-pattern).

So while none of that explains how this happens, it does indicate that
having it in set_next_task() might not be the most robust option.

Instead, explicitly queue the forceidle balancer from pick_next_task()
when it does indeed result in forceidle selection. Having it here,
ensures it can only be triggered under the __schedule() rq->lock
instance, and hence must be ran from that context.

This also happens to clean up the code a little, so win-win.

Fixes: d2dfa17bc7de ("sched: Trivial forced-newidle balancer")
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: T.J. Alumbaugh <talumbau@chromium.org>
Link: https://lkml.kernel.org/r/20220330160535.GN8939@worktop.programming.kicks-ass.net
---
 kernel/sched/core.c  | 14 ++++++++++----
 kernel/sched/idle.c  |  1 -
 kernel/sched/sched.h |  6 ------
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..017ee7807930 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5752,6 +5752,8 @@ static inline struct task_struct *pick_task(struct rq *rq)
 
 extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
 
+static void queue_core_balance(struct rq *rq);
+
 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -5801,7 +5803,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		}
 
 		rq->core_pick = NULL;
-		return next;
+		goto out;
 	}
 
 	put_prev_task_balance(rq, prev, rf);
@@ -5851,7 +5853,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			 */
 			WARN_ON_ONCE(fi_before);
 			task_vruntime_update(rq, next, false);
-			goto done;
+			goto out_set_next;
 		}
 	}
 
@@ -5970,8 +5972,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		resched_curr(rq_i);
 	}
 
-done:
+out_set_next:
 	set_next_task(rq, next);
+out:
+	if (rq->core->core_forceidle_count && next == rq->idle)
+		queue_core_balance(rq);
+
 	return next;
 }
 
@@ -6066,7 +6072,7 @@ static void sched_core_balance(struct rq *rq)
 
 static DEFINE_PER_CPU(struct callback_head, core_balance_head);
 
-void queue_core_balance(struct rq *rq)
+static void queue_core_balance(struct rq *rq)
 {
 	if (!sched_core_enabled(rq))
 		return;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f8b5020e76a..ecb0d7052877 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -434,7 +434,6 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
 {
 	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
-	queue_core_balance(rq);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58263f90c559..8dccb34eb190 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1232,8 +1232,6 @@ static inline bool sched_group_cookie_match(struct rq *rq,
 	return false;
 }
 
-extern void queue_core_balance(struct rq *rq);
-
 static inline bool sched_core_enqueued(struct task_struct *p)
 {
 	return !RB_EMPTY_NODE(&p->core_node);
@@ -1267,10 +1265,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
 	return &rq->__lock;
 }
 
-static inline void queue_core_balance(struct rq *rq)
-{
-}
-
 static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
 {
 	return true;
-- 
cgit v1.2.3


From 386ef214c3c6ab111d05e1790e79475363abaa05 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 17 Mar 2022 15:51:32 +0100
Subject: sched: Teach the forced-newidle balancer about CPU affinity
 limitation.

try_steal_cookie() looks at task_struct::cpus_mask to decide if the
task could be moved to `this' CPU. It ignores that the task might be in
a migration disabled section while not on the CPU. In this case the task
must not be moved otherwise per-CPU assumption are broken.

Use is_cpu_allowed(), as suggested by Peter Zijlstra, to decide if the a
task can be moved.

Fixes: d2dfa17bc7de6 ("sched: Trivial forced-newidle balancer")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YjNK9El+3fzGmswf@linutronix.de
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017ee7807930..51efaabac3e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6006,7 +6006,7 @@ static bool try_steal_cookie(int this, int that)
 		if (p == src->core_pick || p == src->curr)
 			goto next;
 
-		if (!cpumask_test_cpu(this, &p->cpus_mask))
+		if (!is_cpu_allowed(p, this))
 			goto next;
 
 		if (p->core_occupation > dst->idle->core_occupation)
-- 
cgit v1.2.3


From 0a70045ed8516dfcff4b5728557e1ef3fd017c53 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Wed, 30 Mar 2022 10:43:28 +0200
Subject: entry: Fix compile error in dynamic_irqentry_exit_cond_resched()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kernel/entry/common.c: In function ‘dynamic_irqentry_exit_cond_resched’:
kernel/entry/common.c:409:14: error: implicit declaration of function ‘static_key_unlikely’; did you mean ‘static_key_enable’? [-Werror=implicit-function-declaration]
  409 |         if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
      |              ^~~~~~~~~~~~~~~~~~~
      |              static_key_enable

static_key_unlikely() should be static_branch_unlikely().

Fixes: 99cf983cc8bca ("sched/preempt: Add PREEMPT_DYNAMIC using static keys")
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20220330084328.1805665-1-svens@linux.ibm.com
---
 kernel/entry/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index e57a224d6b79..93c3b86e781c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -392,7 +392,7 @@ DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
 DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
 void dynamic_irqentry_exit_cond_resched(void)
 {
-	if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
 		return;
 	raw_irqentry_exit_cond_resched();
 }
-- 
cgit v1.2.3


From 8fd4ddda2f49a66bf5dd3d0c01966c4b1971308b Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 14 Mar 2022 12:49:36 +0100
Subject: static_call: Don't make __static_call_return0 static

System.map shows that vmlinux contains several instances of
__static_call_return0():

	c0004fc0 t __static_call_return0
	c0011518 t __static_call_return0
	c00d8160 t __static_call_return0

arch_static_call_transform() uses the middle one to check whether we are
setting a call to __static_call_return0 or not:

	c0011520 <arch_static_call_transform>:
	c0011520:       3d 20 c0 01     lis     r9,-16383	<== r9 =  0xc001 << 16
	c0011524:       39 29 15 18     addi    r9,r9,5400	<== r9 += 0x1518
	c0011528:       7c 05 48 00     cmpw    r5,r9		<== r9 has value 0xc0011518 here

So if static_call_update() is called with one of the other instances of
__static_call_return0(), arch_static_call_transform() won't recognise it.

In order to work properly, global single instance of __static_call_return0() is required.

Fixes: 3f2a8fc4b15d ("static_call/x86: Add __static_call_return0()")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Link: https://lkml.kernel.org/r/30821468a0e7d28251954b578e5051dc09300d04.1647258493.git.christophe.leroy@csgroup.eu
---
 kernel/Makefile             |   3 +-
 kernel/static_call.c        | 541 -------------------------------------------
 kernel/static_call_inline.c | 543 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 545 insertions(+), 542 deletions(-)
 create mode 100644 kernel/static_call_inline.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 471d71935e90..847a82bfe0e3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -114,7 +114,8 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
 obj-$(CONFIG_KCSAN) += kcsan/
 obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
 obj-$(CONFIG_CFI_CLANG) += cfi.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/static_call.c b/kernel/static_call.c
index f2b8baea35d2..e9c3e69f3837 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -1,549 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
 #include <linux/static_call.h>
-#include <linux/bug.h>
-#include <linux/smp.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/processor.h>
-#include <asm/sections.h>
-
-extern struct static_call_site __start_static_call_sites[],
-			       __stop_static_call_sites[];
-extern struct static_call_tramp_key __start_static_call_tramp_key[],
-				    __stop_static_call_tramp_key[];
-
-static bool static_call_initialized;
-
-/* mutex to protect key modules/sites */
-static DEFINE_MUTEX(static_call_mutex);
-
-static void static_call_lock(void)
-{
-	mutex_lock(&static_call_mutex);
-}
-
-static void static_call_unlock(void)
-{
-	mutex_unlock(&static_call_mutex);
-}
-
-static inline void *static_call_addr(struct static_call_site *site)
-{
-	return (void *)((long)site->addr + (long)&site->addr);
-}
-
-static inline unsigned long __static_call_key(const struct static_call_site *site)
-{
-	return (long)site->key + (long)&site->key;
-}
-
-static inline struct static_call_key *static_call_key(const struct static_call_site *site)
-{
-	return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
-}
-
-/* These assume the key is word-aligned. */
-static inline bool static_call_is_init(struct static_call_site *site)
-{
-	return __static_call_key(site) & STATIC_CALL_SITE_INIT;
-}
-
-static inline bool static_call_is_tail(struct static_call_site *site)
-{
-	return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
-}
-
-static inline void static_call_set_init(struct static_call_site *site)
-{
-	site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
-		    (long)&site->key;
-}
-
-static int static_call_site_cmp(const void *_a, const void *_b)
-{
-	const struct static_call_site *a = _a;
-	const struct static_call_site *b = _b;
-	const struct static_call_key *key_a = static_call_key(a);
-	const struct static_call_key *key_b = static_call_key(b);
-
-	if (key_a < key_b)
-		return -1;
-
-	if (key_a > key_b)
-		return 1;
-
-	return 0;
-}
-
-static void static_call_site_swap(void *_a, void *_b, int size)
-{
-	long delta = (unsigned long)_a - (unsigned long)_b;
-	struct static_call_site *a = _a;
-	struct static_call_site *b = _b;
-	struct static_call_site tmp = *a;
-
-	a->addr = b->addr  - delta;
-	a->key  = b->key   - delta;
-
-	b->addr = tmp.addr + delta;
-	b->key  = tmp.key  + delta;
-}
-
-static inline void static_call_sort_entries(struct static_call_site *start,
-					    struct static_call_site *stop)
-{
-	sort(start, stop - start, sizeof(struct static_call_site),
-	     static_call_site_cmp, static_call_site_swap);
-}
-
-static inline bool static_call_key_has_mods(struct static_call_key *key)
-{
-	return !(key->type & 1);
-}
-
-static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
-{
-	if (!static_call_key_has_mods(key))
-		return NULL;
-
-	return key->mods;
-}
-
-static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
-{
-	if (static_call_key_has_mods(key))
-		return NULL;
-
-	return (struct static_call_site *)(key->type & ~1);
-}
-
-void __static_call_update(struct static_call_key *key, void *tramp, void *func)
-{
-	struct static_call_site *site, *stop;
-	struct static_call_mod *site_mod, first;
-
-	cpus_read_lock();
-	static_call_lock();
-
-	if (key->func == func)
-		goto done;
-
-	key->func = func;
-
-	arch_static_call_transform(NULL, tramp, func, false);
-
-	/*
-	 * If uninitialized, we'll not update the callsites, but they still
-	 * point to the trampoline and we just patched that.
-	 */
-	if (WARN_ON_ONCE(!static_call_initialized))
-		goto done;
-
-	first = (struct static_call_mod){
-		.next = static_call_key_next(key),
-		.mod = NULL,
-		.sites = static_call_key_sites(key),
-	};
-
-	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
-		bool init = system_state < SYSTEM_RUNNING;
-		struct module *mod = site_mod->mod;
-
-		if (!site_mod->sites) {
-			/*
-			 * This can happen if the static call key is defined in
-			 * a module which doesn't use it.
-			 *
-			 * It also happens in the has_mods case, where the
-			 * 'first' entry has no sites associated with it.
-			 */
-			continue;
-		}
-
-		stop = __stop_static_call_sites;
-
-		if (mod) {
-#ifdef CONFIG_MODULES
-			stop = mod->static_call_sites +
-			       mod->num_static_call_sites;
-			init = mod->state == MODULE_STATE_COMING;
-#endif
-		}
-
-		for (site = site_mod->sites;
-		     site < stop && static_call_key(site) == key; site++) {
-			void *site_addr = static_call_addr(site);
-
-			if (!init && static_call_is_init(site))
-				continue;
-
-			if (!kernel_text_address((unsigned long)site_addr)) {
-				/*
-				 * This skips patching built-in __exit, which
-				 * is part of init_section_contains() but is
-				 * not part of kernel_text_address().
-				 *
-				 * Skipping built-in __exit is fine since it
-				 * will never be executed.
-				 */
-				WARN_ONCE(!static_call_is_init(site),
-					  "can't patch static call site at %pS",
-					  site_addr);
-				continue;
-			}
-
-			arch_static_call_transform(site_addr, NULL, func,
-						   static_call_is_tail(site));
-		}
-	}
-
-done:
-	static_call_unlock();
-	cpus_read_unlock();
-}
-EXPORT_SYMBOL_GPL(__static_call_update);
-
-static int __static_call_init(struct module *mod,
-			      struct static_call_site *start,
-			      struct static_call_site *stop)
-{
-	struct static_call_site *site;
-	struct static_call_key *key, *prev_key = NULL;
-	struct static_call_mod *site_mod;
-
-	if (start == stop)
-		return 0;
-
-	static_call_sort_entries(start, stop);
-
-	for (site = start; site < stop; site++) {
-		void *site_addr = static_call_addr(site);
-
-		if ((mod && within_module_init((unsigned long)site_addr, mod)) ||
-		    (!mod && init_section_contains(site_addr, 1)))
-			static_call_set_init(site);
-
-		key = static_call_key(site);
-		if (key != prev_key) {
-			prev_key = key;
-
-			/*
-			 * For vmlinux (!mod) avoid the allocation by storing
-			 * the sites pointer in the key itself. Also see
-			 * __static_call_update()'s @first.
-			 *
-			 * This allows architectures (eg. x86) to call
-			 * static_call_init() before memory allocation works.
-			 */
-			if (!mod) {
-				key->sites = site;
-				key->type |= 1;
-				goto do_transform;
-			}
-
-			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
-			if (!site_mod)
-				return -ENOMEM;
-
-			/*
-			 * When the key has a direct sites pointer, extract
-			 * that into an explicit struct static_call_mod, so we
-			 * can have a list of modules.
-			 */
-			if (static_call_key_sites(key)) {
-				site_mod->mod = NULL;
-				site_mod->next = NULL;
-				site_mod->sites = static_call_key_sites(key);
-
-				key->mods = site_mod;
-
-				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
-				if (!site_mod)
-					return -ENOMEM;
-			}
-
-			site_mod->mod = mod;
-			site_mod->sites = site;
-			site_mod->next = static_call_key_next(key);
-			key->mods = site_mod;
-		}
-
-do_transform:
-		arch_static_call_transform(site_addr, NULL, key->func,
-				static_call_is_tail(site));
-	}
-
-	return 0;
-}
-
-static int addr_conflict(struct static_call_site *site, void *start, void *end)
-{
-	unsigned long addr = (unsigned long)static_call_addr(site);
-
-	if (addr <= (unsigned long)end &&
-	    addr + CALL_INSN_SIZE > (unsigned long)start)
-		return 1;
-
-	return 0;
-}
-
-static int __static_call_text_reserved(struct static_call_site *iter_start,
-				       struct static_call_site *iter_stop,
-				       void *start, void *end, bool init)
-{
-	struct static_call_site *iter = iter_start;
-
-	while (iter < iter_stop) {
-		if (init || !static_call_is_init(iter)) {
-			if (addr_conflict(iter, start, end))
-				return 1;
-		}
-		iter++;
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_MODULES
-
-static int __static_call_mod_text_reserved(void *start, void *end)
-{
-	struct module *mod;
-	int ret;
-
-	preempt_disable();
-	mod = __module_text_address((unsigned long)start);
-	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
-	if (!try_module_get(mod))
-		mod = NULL;
-	preempt_enable();
-
-	if (!mod)
-		return 0;
-
-	ret = __static_call_text_reserved(mod->static_call_sites,
-			mod->static_call_sites + mod->num_static_call_sites,
-			start, end, mod->state == MODULE_STATE_COMING);
-
-	module_put(mod);
-
-	return ret;
-}
-
-static unsigned long tramp_key_lookup(unsigned long addr)
-{
-	struct static_call_tramp_key *start = __start_static_call_tramp_key;
-	struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
-	struct static_call_tramp_key *tramp_key;
-
-	for (tramp_key = start; tramp_key != stop; tramp_key++) {
-		unsigned long tramp;
-
-		tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
-		if (tramp == addr)
-			return (long)tramp_key->key + (long)&tramp_key->key;
-	}
-
-	return 0;
-}
-
-static int static_call_add_module(struct module *mod)
-{
-	struct static_call_site *start = mod->static_call_sites;
-	struct static_call_site *stop = start + mod->num_static_call_sites;
-	struct static_call_site *site;
-
-	for (site = start; site != stop; site++) {
-		unsigned long s_key = __static_call_key(site);
-		unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
-		unsigned long key;
-
-		/*
-		 * Is the key is exported, 'addr' points to the key, which
-		 * means modules are allowed to call static_call_update() on
-		 * it.
-		 *
-		 * Otherwise, the key isn't exported, and 'addr' points to the
-		 * trampoline so we need to lookup the key.
-		 *
-		 * We go through this dance to prevent crazy modules from
-		 * abusing sensitive static calls.
-		 */
-		if (!kernel_text_address(addr))
-			continue;
-
-		key = tramp_key_lookup(addr);
-		if (!key) {
-			pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
-				static_call_addr(site));
-			return -EINVAL;
-		}
-
-		key |= s_key & STATIC_CALL_SITE_FLAGS;
-		site->key = key - (long)&site->key;
-	}
-
-	return __static_call_init(mod, start, stop);
-}
-
-static void static_call_del_module(struct module *mod)
-{
-	struct static_call_site *start = mod->static_call_sites;
-	struct static_call_site *stop = mod->static_call_sites +
-					mod->num_static_call_sites;
-	struct static_call_key *key, *prev_key = NULL;
-	struct static_call_mod *site_mod, **prev;
-	struct static_call_site *site;
-
-	for (site = start; site < stop; site++) {
-		key = static_call_key(site);
-		if (key == prev_key)
-			continue;
-
-		prev_key = key;
-
-		for (prev = &key->mods, site_mod = key->mods;
-		     site_mod && site_mod->mod != mod;
-		     prev = &site_mod->next, site_mod = site_mod->next)
-			;
-
-		if (!site_mod)
-			continue;
-
-		*prev = site_mod->next;
-		kfree(site_mod);
-	}
-}
-
-static int static_call_module_notify(struct notifier_block *nb,
-				     unsigned long val, void *data)
-{
-	struct module *mod = data;
-	int ret = 0;
-
-	cpus_read_lock();
-	static_call_lock();
-
-	switch (val) {
-	case MODULE_STATE_COMING:
-		ret = static_call_add_module(mod);
-		if (ret) {
-			WARN(1, "Failed to allocate memory for static calls");
-			static_call_del_module(mod);
-		}
-		break;
-	case MODULE_STATE_GOING:
-		static_call_del_module(mod);
-		break;
-	}
-
-	static_call_unlock();
-	cpus_read_unlock();
-
-	return notifier_from_errno(ret);
-}
-
-static struct notifier_block static_call_module_nb = {
-	.notifier_call = static_call_module_notify,
-};
-
-#else
-
-static inline int __static_call_mod_text_reserved(void *start, void *end)
-{
-	return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
-int static_call_text_reserved(void *start, void *end)
-{
-	bool init = system_state < SYSTEM_RUNNING;
-	int ret = __static_call_text_reserved(__start_static_call_sites,
-			__stop_static_call_sites, start, end, init);
-
-	if (ret)
-		return ret;
-
-	return __static_call_mod_text_reserved(start, end);
-}
-
-int __init static_call_init(void)
-{
-	int ret;
-
-	if (static_call_initialized)
-		return 0;
-
-	cpus_read_lock();
-	static_call_lock();
-	ret = __static_call_init(NULL, __start_static_call_sites,
-				 __stop_static_call_sites);
-	static_call_unlock();
-	cpus_read_unlock();
-
-	if (ret) {
-		pr_err("Failed to allocate memory for static_call!\n");
-		BUG();
-	}
-
-	static_call_initialized = true;
-
-#ifdef CONFIG_MODULES
-	register_module_notifier(&static_call_module_nb);
-#endif
-	return 0;
-}
-early_initcall(static_call_init);
 
 long __static_call_return0(void)
 {
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__static_call_return0);
-
-#ifdef CONFIG_STATIC_CALL_SELFTEST
-
-static int func_a(int x)
-{
-	return x+1;
-}
-
-static int func_b(int x)
-{
-	return x+2;
-}
-
-DEFINE_STATIC_CALL(sc_selftest, func_a);
-
-static struct static_call_data {
-      int (*func)(int);
-      int val;
-      int expect;
-} static_call_data [] __initdata = {
-      { NULL,   2, 3 },
-      { func_b, 2, 4 },
-      { func_a, 2, 3 }
-};
-
-static int __init test_static_call_init(void)
-{
-      int i;
-
-      for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) {
-	      struct static_call_data *scd = &static_call_data[i];
-
-              if (scd->func)
-                      static_call_update(sc_selftest, scd->func);
-
-              WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect);
-      }
-
-      return 0;
-}
-early_initcall(test_static_call_init);
-
-#endif /* CONFIG_STATIC_CALL_SELFTEST */
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
new file mode 100644
index 000000000000..dc5665b62814
--- /dev/null
+++ b/kernel/static_call_inline.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/static_call.h>
+#include <linux/bug.h>
+#include <linux/smp.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/processor.h>
+#include <asm/sections.h>
+
+extern struct static_call_site __start_static_call_sites[],
+			       __stop_static_call_sites[];
+extern struct static_call_tramp_key __start_static_call_tramp_key[],
+				    __stop_static_call_tramp_key[];
+
+static bool static_call_initialized;
+
+/* mutex to protect key modules/sites */
+static DEFINE_MUTEX(static_call_mutex);
+
+static void static_call_lock(void)
+{
+	mutex_lock(&static_call_mutex);
+}
+
+static void static_call_unlock(void)
+{
+	mutex_unlock(&static_call_mutex);
+}
+
+static inline void *static_call_addr(struct static_call_site *site)
+{
+	return (void *)((long)site->addr + (long)&site->addr);
+}
+
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+	return (long)site->key + (long)&site->key;
+}
+
+static inline struct static_call_key *static_call_key(const struct static_call_site *site)
+{
+	return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
+}
+
+/* These assume the key is word-aligned. */
+static inline bool static_call_is_init(struct static_call_site *site)
+{
+	return __static_call_key(site) & STATIC_CALL_SITE_INIT;
+}
+
+static inline bool static_call_is_tail(struct static_call_site *site)
+{
+	return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
+}
+
+static inline void static_call_set_init(struct static_call_site *site)
+{
+	site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
+		    (long)&site->key;
+}
+
+static int static_call_site_cmp(const void *_a, const void *_b)
+{
+	const struct static_call_site *a = _a;
+	const struct static_call_site *b = _b;
+	const struct static_call_key *key_a = static_call_key(a);
+	const struct static_call_key *key_b = static_call_key(b);
+
+	if (key_a < key_b)
+		return -1;
+
+	if (key_a > key_b)
+		return 1;
+
+	return 0;
+}
+
+static void static_call_site_swap(void *_a, void *_b, int size)
+{
+	long delta = (unsigned long)_a - (unsigned long)_b;
+	struct static_call_site *a = _a;
+	struct static_call_site *b = _b;
+	struct static_call_site tmp = *a;
+
+	a->addr = b->addr  - delta;
+	a->key  = b->key   - delta;
+
+	b->addr = tmp.addr + delta;
+	b->key  = tmp.key  + delta;
+}
+
+static inline void static_call_sort_entries(struct static_call_site *start,
+					    struct static_call_site *stop)
+{
+	sort(start, stop - start, sizeof(struct static_call_site),
+	     static_call_site_cmp, static_call_site_swap);
+}
+
+static inline bool static_call_key_has_mods(struct static_call_key *key)
+{
+	return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+	if (!static_call_key_has_mods(key))
+		return NULL;
+
+	return key->mods;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+	if (static_call_key_has_mods(key))
+		return NULL;
+
+	return (struct static_call_site *)(key->type & ~1);
+}
+
+void __static_call_update(struct static_call_key *key, void *tramp, void *func)
+{
+	struct static_call_site *site, *stop;
+	struct static_call_mod *site_mod, first;
+
+	cpus_read_lock();
+	static_call_lock();
+
+	if (key->func == func)
+		goto done;
+
+	key->func = func;
+
+	arch_static_call_transform(NULL, tramp, func, false);
+
+	/*
+	 * If uninitialized, we'll not update the callsites, but they still
+	 * point to the trampoline and we just patched that.
+	 */
+	if (WARN_ON_ONCE(!static_call_initialized))
+		goto done;
+
+	first = (struct static_call_mod){
+		.next = static_call_key_next(key),
+		.mod = NULL,
+		.sites = static_call_key_sites(key),
+	};
+
+	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+		bool init = system_state < SYSTEM_RUNNING;
+		struct module *mod = site_mod->mod;
+
+		if (!site_mod->sites) {
+			/*
+			 * This can happen if the static call key is defined in
+			 * a module which doesn't use it.
+			 *
+			 * It also happens in the has_mods case, where the
+			 * 'first' entry has no sites associated with it.
+			 */
+			continue;
+		}
+
+		stop = __stop_static_call_sites;
+
+		if (mod) {
+#ifdef CONFIG_MODULES
+			stop = mod->static_call_sites +
+			       mod->num_static_call_sites;
+			init = mod->state == MODULE_STATE_COMING;
+#endif
+		}
+
+		for (site = site_mod->sites;
+		     site < stop && static_call_key(site) == key; site++) {
+			void *site_addr = static_call_addr(site);
+
+			if (!init && static_call_is_init(site))
+				continue;
+
+			if (!kernel_text_address((unsigned long)site_addr)) {
+				/*
+				 * This skips patching built-in __exit, which
+				 * is part of init_section_contains() but is
+				 * not part of kernel_text_address().
+				 *
+				 * Skipping built-in __exit is fine since it
+				 * will never be executed.
+				 */
+				WARN_ONCE(!static_call_is_init(site),
+					  "can't patch static call site at %pS",
+					  site_addr);
+				continue;
+			}
+
+			arch_static_call_transform(site_addr, NULL, func,
+						   static_call_is_tail(site));
+		}
+	}
+
+done:
+	static_call_unlock();
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__static_call_update);
+
+static int __static_call_init(struct module *mod,
+			      struct static_call_site *start,
+			      struct static_call_site *stop)
+{
+	struct static_call_site *site;
+	struct static_call_key *key, *prev_key = NULL;
+	struct static_call_mod *site_mod;
+
+	if (start == stop)
+		return 0;
+
+	static_call_sort_entries(start, stop);
+
+	for (site = start; site < stop; site++) {
+		void *site_addr = static_call_addr(site);
+
+		if ((mod && within_module_init((unsigned long)site_addr, mod)) ||
+		    (!mod && init_section_contains(site_addr, 1)))
+			static_call_set_init(site);
+
+		key = static_call_key(site);
+		if (key != prev_key) {
+			prev_key = key;
+
+			/*
+			 * For vmlinux (!mod) avoid the allocation by storing
+			 * the sites pointer in the key itself. Also see
+			 * __static_call_update()'s @first.
+			 *
+			 * This allows architectures (eg. x86) to call
+			 * static_call_init() before memory allocation works.
+			 */
+			if (!mod) {
+				key->sites = site;
+				key->type |= 1;
+				goto do_transform;
+			}
+
+			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+			if (!site_mod)
+				return -ENOMEM;
+
+			/*
+			 * When the key has a direct sites pointer, extract
+			 * that into an explicit struct static_call_mod, so we
+			 * can have a list of modules.
+			 */
+			if (static_call_key_sites(key)) {
+				site_mod->mod = NULL;
+				site_mod->next = NULL;
+				site_mod->sites = static_call_key_sites(key);
+
+				key->mods = site_mod;
+
+				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+				if (!site_mod)
+					return -ENOMEM;
+			}
+
+			site_mod->mod = mod;
+			site_mod->sites = site;
+			site_mod->next = static_call_key_next(key);
+			key->mods = site_mod;
+		}
+
+do_transform:
+		arch_static_call_transform(site_addr, NULL, key->func,
+				static_call_is_tail(site));
+	}
+
+	return 0;
+}
+
+static int addr_conflict(struct static_call_site *site, void *start, void *end)
+{
+	unsigned long addr = (unsigned long)static_call_addr(site);
+
+	if (addr <= (unsigned long)end &&
+	    addr + CALL_INSN_SIZE > (unsigned long)start)
+		return 1;
+
+	return 0;
+}
+
+static int __static_call_text_reserved(struct static_call_site *iter_start,
+				       struct static_call_site *iter_stop,
+				       void *start, void *end, bool init)
+{
+	struct static_call_site *iter = iter_start;
+
+	while (iter < iter_stop) {
+		if (init || !static_call_is_init(iter)) {
+			if (addr_conflict(iter, start, end))
+				return 1;
+		}
+		iter++;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int __static_call_mod_text_reserved(void *start, void *end)
+{
+	struct module *mod;
+	int ret;
+
+	preempt_disable();
+	mod = __module_text_address((unsigned long)start);
+	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+	if (!try_module_get(mod))
+		mod = NULL;
+	preempt_enable();
+
+	if (!mod)
+		return 0;
+
+	ret = __static_call_text_reserved(mod->static_call_sites,
+			mod->static_call_sites + mod->num_static_call_sites,
+			start, end, mod->state == MODULE_STATE_COMING);
+
+	module_put(mod);
+
+	return ret;
+}
+
+static unsigned long tramp_key_lookup(unsigned long addr)
+{
+	struct static_call_tramp_key *start = __start_static_call_tramp_key;
+	struct static_call_tramp_key *stop = __stop_static_call_tramp_key;
+	struct static_call_tramp_key *tramp_key;
+
+	for (tramp_key = start; tramp_key != stop; tramp_key++) {
+		unsigned long tramp;
+
+		tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp;
+		if (tramp == addr)
+			return (long)tramp_key->key + (long)&tramp_key->key;
+	}
+
+	return 0;
+}
+
+static int static_call_add_module(struct module *mod)
+{
+	struct static_call_site *start = mod->static_call_sites;
+	struct static_call_site *stop = start + mod->num_static_call_sites;
+	struct static_call_site *site;
+
+	for (site = start; site != stop; site++) {
+		unsigned long s_key = __static_call_key(site);
+		unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
+		unsigned long key;
+
+		/*
+		 * Is the key is exported, 'addr' points to the key, which
+		 * means modules are allowed to call static_call_update() on
+		 * it.
+		 *
+		 * Otherwise, the key isn't exported, and 'addr' points to the
+		 * trampoline so we need to lookup the key.
+		 *
+		 * We go through this dance to prevent crazy modules from
+		 * abusing sensitive static calls.
+		 */
+		if (!kernel_text_address(addr))
+			continue;
+
+		key = tramp_key_lookup(addr);
+		if (!key) {
+			pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n",
+				static_call_addr(site));
+			return -EINVAL;
+		}
+
+		key |= s_key & STATIC_CALL_SITE_FLAGS;
+		site->key = key - (long)&site->key;
+	}
+
+	return __static_call_init(mod, start, stop);
+}
+
+static void static_call_del_module(struct module *mod)
+{
+	struct static_call_site *start = mod->static_call_sites;
+	struct static_call_site *stop = mod->static_call_sites +
+					mod->num_static_call_sites;
+	struct static_call_key *key, *prev_key = NULL;
+	struct static_call_mod *site_mod, **prev;
+	struct static_call_site *site;
+
+	for (site = start; site < stop; site++) {
+		key = static_call_key(site);
+		if (key == prev_key)
+			continue;
+
+		prev_key = key;
+
+		for (prev = &key->mods, site_mod = key->mods;
+		     site_mod && site_mod->mod != mod;
+		     prev = &site_mod->next, site_mod = site_mod->next)
+			;
+
+		if (!site_mod)
+			continue;
+
+		*prev = site_mod->next;
+		kfree(site_mod);
+	}
+}
+
+static int static_call_module_notify(struct notifier_block *nb,
+				     unsigned long val, void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	cpus_read_lock();
+	static_call_lock();
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ret = static_call_add_module(mod);
+		if (ret) {
+			WARN(1, "Failed to allocate memory for static calls");
+			static_call_del_module(mod);
+		}
+		break;
+	case MODULE_STATE_GOING:
+		static_call_del_module(mod);
+		break;
+	}
+
+	static_call_unlock();
+	cpus_read_unlock();
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block static_call_module_nb = {
+	.notifier_call = static_call_module_notify,
+};
+
+#else
+
+static inline int __static_call_mod_text_reserved(void *start, void *end)
+{
+	return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+int static_call_text_reserved(void *start, void *end)
+{
+	bool init = system_state < SYSTEM_RUNNING;
+	int ret = __static_call_text_reserved(__start_static_call_sites,
+			__stop_static_call_sites, start, end, init);
+
+	if (ret)
+		return ret;
+
+	return __static_call_mod_text_reserved(start, end);
+}
+
+int __init static_call_init(void)
+{
+	int ret;
+
+	if (static_call_initialized)
+		return 0;
+
+	cpus_read_lock();
+	static_call_lock();
+	ret = __static_call_init(NULL, __start_static_call_sites,
+				 __stop_static_call_sites);
+	static_call_unlock();
+	cpus_read_unlock();
+
+	if (ret) {
+		pr_err("Failed to allocate memory for static_call!\n");
+		BUG();
+	}
+
+	static_call_initialized = true;
+
+#ifdef CONFIG_MODULES
+	register_module_notifier(&static_call_module_nb);
+#endif
+	return 0;
+}
+early_initcall(static_call_init);
+
+#ifdef CONFIG_STATIC_CALL_SELFTEST
+
+static int func_a(int x)
+{
+	return x+1;
+}
+
+static int func_b(int x)
+{
+	return x+2;
+}
+
+DEFINE_STATIC_CALL(sc_selftest, func_a);
+
+static struct static_call_data {
+      int (*func)(int);
+      int val;
+      int expect;
+} static_call_data [] __initdata = {
+      { NULL,   2, 3 },
+      { func_b, 2, 4 },
+      { func_a, 2, 3 }
+};
+
+static int __init test_static_call_init(void)
+{
+      int i;
+
+      for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) {
+	      struct static_call_data *scd = &static_call_data[i];
+
+              if (scd->func)
+                      static_call_update(sc_selftest, scd->func);
+
+              WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect);
+      }
+
+      return 0;
+}
+early_initcall(test_static_call_init);
+
+#endif /* CONFIG_STATIC_CALL_SELFTEST */
-- 
cgit v1.2.3


From e3265a4386428d3d157d9565bb520aabff8b4bf0 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Mon, 28 Mar 2022 13:01:12 -0700
Subject: perf/core: Inherit event_caps

It was reported that some perf event setup can make fork failed on
ARM64.  It was the case of a group of mixed hw and sw events and it
failed in perf_event_init_task() due to armpmu_event_init().

The ARM PMU code checks if all the events in a group belong to the
same PMU except for software events.  But it didn't set the event_caps
of inherited events and no longer identify them as software events.
Therefore the test failed in a child process.

A simple reproducer is:

  $ perf stat -e '{cycles,cs,instructions}' perf bench sched messaging
  # Running 'sched/messaging' benchmark:
  perf: fork(): Invalid argument

The perf stat was fine but the perf bench failed in fork().  Let's
inherit the event caps from the parent.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20220328200112.457740-1-namhyung@kernel.org
---
 kernel/events/core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index cfde994ce61c..3980efcf931d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11635,6 +11635,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (parent_event)
+		event->event_caps = parent_event->event_caps;
+
 	if (event->attr.sigtrap)
 		atomic_set(&event->event_limit, 1);
 
-- 
cgit v1.2.3


From a0827713e298d021d3c79ae7423aea408f3f7c3a Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 29 Mar 2022 23:45:20 +0800
Subject: perf/core: Don't pass task around when ctx sched in

The current code pass task around for ctx_sched_in(), only
to get perf_cgroup of the task, then update the timestamp
of it and its ancestors and set them to active.

But we can use cpuctx->cgrp to get active perf_cgroup and
its ancestors since cpuctx->cgrp has been set before
ctx_sched_in().

This patch remove the task argument in ctx_sched_in()
and cleanup related code.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-2-zhouchengming@bytedance.com
---
 kernel/events/core.c | 58 +++++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3980efcf931d..65450200691c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type,
-			     struct task_struct *task);
+			     enum event_type_t event_type);
 
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
@@ -801,10 +800,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
-			  struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cgroup *cgrp;
+	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct perf_cgroup *cgrp = cpuctx->cgrp;
 	struct perf_cgroup_info *info;
 	struct cgroup_subsys_state *css;
 
@@ -813,10 +812,10 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	 * ensure we do not access cgroup data
 	 * unless we have the cgroup pinned (css_get)
 	 */
-	if (!task || !ctx->nr_cgroups)
+	if (!cgrp)
 		return;
 
-	cgrp = perf_cgroup_from_task(task, ctx);
+	WARN_ON_ONCE(!ctx->nr_cgroups);
 
 	for (css = &cgrp->css; css; css = css->parent) {
 		cgrp = container_of(css, struct perf_cgroup, css);
@@ -869,14 +868,14 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 			WARN_ON_ONCE(cpuctx->cgrp);
 			/*
 			 * set cgrp before ctxsw in to allow
-			 * event_filter_match() to not have to pass
-			 * task around
+			 * perf_cgroup_set_timestamp() in ctx_sched_in()
+			 * to not have to pass task around
 			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
 			 * because cgorup events are only per-cpu
 			 */
 			cpuctx->cgrp = perf_cgroup_from_task(task,
 							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1118,8 +1117,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
-			  struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
@@ -2713,8 +2711,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type,
-	     struct task_struct *task);
+	     enum event_type_t event_type);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			       struct perf_event_context *ctx,
@@ -2730,15 +2727,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-				struct perf_event_context *ctx,
-				struct task_struct *task)
+				struct perf_event_context *ctx)
 {
-	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2788,7 +2784,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	else if (ctx_event_type & EVENT_PINNED)
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx);
 	perf_pmu_enable(cpuctx->ctx.pmu);
 }
 
@@ -3011,7 +3007,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 		return;
 	}
 
@@ -3020,7 +3016,7 @@ static void __perf_event_enable(struct perf_event *event,
 	 * then don't put it on unless the group is on.
 	 */
 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 		return;
 	}
 
@@ -3865,8 +3861,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type,
-	     struct task_struct *task)
+	     enum event_type_t event_type)
 {
 	int is_active = ctx->is_active;
 
@@ -3878,7 +3873,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (is_active ^ EVENT_TIME) {
 		/* start ctx time */
 		__update_context_time(ctx, false);
-		perf_cgroup_set_timestamp(task, ctx);
+		perf_cgroup_set_timestamp(cpuctx);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3909,12 +3904,11 @@ ctx_sched_in(struct perf_event_context *ctx,
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type,
-			     struct task_struct *task)
+			     enum event_type_t event_type)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type, task);
+	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
 static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3956,7 +3950,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	perf_event_sched_in(cpuctx, ctx, task);
+	perf_event_sched_in(cpuctx, ctx);
 
 	if (cpuctx->sched_cb_usage && pmu->sched_task)
 		pmu->sched_task(cpuctx->task_ctx, true);
@@ -4267,7 +4261,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (cpu_event)
 		rotate_ctx(&cpuctx->ctx, cpu_event);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4339,7 +4333,7 @@ static void perf_event_enable_on_exec(int ctxn)
 		clone_ctx = unclone_ctx(ctx);
 		ctx_resched(cpuctx, ctx, event_type);
 	} else {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
-- 
cgit v1.2.3


From 6875186aea5ce09a644758d9193265da1cc187c7 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 29 Mar 2022 23:45:21 +0800
Subject: perf/core: Use perf_cgroup_info->active to check if cgroup is active

Since we use perf_cgroup_set_timestamp() to start cgroup time and
set active to 1, then use update_cgrp_time_from_cpuctx() to stop
cgroup time and set active to 0.

We can use info->active directly to check if cgroup is active.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-3-zhouchengming@bytedance.com
---
 kernel/events/core.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 65450200691c..a08fb92b3934 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -780,7 +780,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 static inline void update_cgrp_time_from_event(struct perf_event *event)
 {
 	struct perf_cgroup_info *info;
-	struct perf_cgroup *cgrp;
 
 	/*
 	 * ensure we access cgroup data only when needed and
@@ -789,14 +788,12 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	if (!is_cgroup_event(event))
 		return;
 
-	cgrp = perf_cgroup_from_task(current, event->ctx);
+	info = this_cpu_ptr(event->cgrp->info);
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
-		info = this_cpu_ptr(event->cgrp->info);
+	if (info->active)
 		__update_cgrp_time(info, perf_clock(), true);
-	}
 }
 
 static inline void
-- 
cgit v1.2.3


From 96492a6c558acb56124844d1409d9ef8624a0322 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 29 Mar 2022 23:45:22 +0800
Subject: perf/core: Fix perf_cgroup_switch()

There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp)
in perf_cgroup_switch().

CPU1						CPU2
perf_cgroup_sched_out(prev, next)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    perf_cgroup_switch(prev, PERF_CGROUP_SWOUT)
						cgroup_migrate_execute()
						  task->cgroups = ?
						  perf_cgroup_attach()
						    task_function_call(task, __perf_cgroup_move)
perf_cgroup_sched_in(prev, next)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    perf_cgroup_switch(next, PERF_CGROUP_SWIN)
						__perf_cgroup_move()
						  perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN)

The commit a8d757ef076f ("perf events: Fix slow and broken cgroup
context switch code") want to skip perf_cgroup_switch() when the
perf_cgroup of "prev" and "next" are the same.

But task->cgroups can change in concurrent with context_switch()
in cgroup_migrate_execute(). If cgrp1 == cgrp2 in sched_out(),
cpuctx won't do sched_out. Then task->cgroups changed cause
cgrp1 != cgrp2 in sched_in(), cpuctx will do sched_in. So trigger
WARN_ON_ONCE(cpuctx->cgrp).

Even though __perf_cgroup_move() will be synchronized as the context
switch disables the interrupt, context_switch() still can see the
task->cgroups is changing in the middle, since task->cgroups changed
before sending IPI.

So we have to combine perf_cgroup_sched_in() into perf_cgroup_sched_out(),
unified into perf_cgroup_switch(), to fix the incosistency between
perf_cgroup_sched_out() and perf_cgroup_sched_in().

But we can't just compare prev->cgroups with next->cgroups to decide
whether to skip cpuctx sched_out/in since the prev->cgroups is changing
too. For example:

CPU1					CPU2
					cgroup_migrate_execute()
					  prev->cgroups = ?
					  perf_cgroup_attach()
					    task_function_call(task, __perf_cgroup_move)
perf_cgroup_switch(task)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    cpuctx sched_out/in ...
					task_function_call() will return -ESRCH

In the above example, prev->cgroups changing cause (cgrp1 == cgrp2)
to be true, so skip cpuctx sched_out/in. And later task_function_call()
would return -ESRCH since the prev task isn't running on cpu anymore.
So we would leave perf_events of the old prev->cgroups still sched on
the CPU, which is wrong.

The solution is that we should use cpuctx->cgrp to compare with
the next task's perf_cgroup. Since cpuctx->cgrp can only be changed
on local CPU, and we have irq disabled, we can read cpuctx->cgrp to
compare without holding ctx lock.

Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-4-zhouchengming@bytedance.com
---
 kernel/events/core.c | 132 ++++++++++-----------------------------------------
 1 file changed, 25 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a08fb92b3934..bdeb41fe7f15 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -824,17 +824,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 
-#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
-
 /*
  * reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
  */
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
 {
+	struct perf_cgroup *cgrp;
 	struct perf_cpu_context *cpuctx, *tmp;
 	struct list_head *list;
 	unsigned long flags;
@@ -845,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 */
 	local_irq_save(flags);
 
+	cgrp = perf_cgroup_from_task(task, NULL);
+
 	list = this_cpu_ptr(&cgrp_cpuctx_list);
 	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
 		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+		if (READ_ONCE(cpuctx->cgrp) == cgrp)
+			continue;
 
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-			/*
-			 * must not be done before ctxswout due
-			 * to event_filter_match() in event_sched_out()
-			 */
-			cpuctx->cgrp = NULL;
-		}
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		/*
+		 * must not be done before ctxswout due
+		 * to update_cgrp_time_from_cpuctx() in
+		 * ctx_sched_out()
+		 */
+		cpuctx->cgrp = cgrp;
+		/*
+		 * set cgrp before ctxsw in to allow
+		 * perf_cgroup_set_timestamp() in ctx_sched_in()
+		 * to not have to pass task around
+		 */
+		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 
-		if (mode & PERF_CGROUP_SWIN) {
-			WARN_ON_ONCE(cpuctx->cgrp);
-			/*
-			 * set cgrp before ctxsw in to allow
-			 * perf_cgroup_set_timestamp() in ctx_sched_in()
-			 * to not have to pass task around
-			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-			 * because cgorup events are only per-cpu
-			 */
-			cpuctx->cgrp = perf_cgroup_from_task(task,
-							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
-		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
@@ -881,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(next, NULL);
-
-	/*
-	 * only schedule out current cgroup events if we know
-	 * that we are switching to a different cgroup. Otherwise,
-	 * do no touch the cgroup events.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
-	rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(prev, NULL);
-
-	/*
-	 * only need to schedule in cgroup events if we are changing
-	 * cgroup during ctxsw. Cgroup events were not scheduled
-	 * out of ctxsw out if that was not the case.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
-
-	rcu_read_unlock();
-}
-
 static int perf_cgroup_ensure_storage(struct perf_event *event,
 				struct cgroup_subsys_state *css)
 {
@@ -1096,16 +1035,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-}
-
 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 				      struct perf_event_attr *attr,
 				      struct perf_event *group_leader)
@@ -1118,11 +1047,6 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	return 0;
@@ -1142,6 +1066,10 @@ static inline void
 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
 {
 }
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
 #endif
 
 /*
@@ -3661,7 +3589,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_out(task, next);
+		perf_cgroup_switch(next);
 }
 
 /*
@@ -3975,16 +3903,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	struct perf_event_context *ctx;
 	int ctxn;
 
-	/*
-	 * If cgroup events exist on this CPU, then we need to check if we have
-	 * to switch in PMU state; cgroup event are system-wide mode only.
-	 *
-	 * Since cgroup events are CPU events, we must schedule these in before
-	 * we schedule in the task events.
-	 */
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_in(prev, task);
-
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (likely(!ctx))
@@ -13556,7 +13474,7 @@ static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
 	rcu_read_lock();
-	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	perf_cgroup_switch(task);
 	rcu_read_unlock();
 	return 0;
 }
-- 
cgit v1.2.3


From e19cd0b6fa5938c51d7b928010d584f0de93913a Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 29 Mar 2022 23:45:23 +0800
Subject: perf/core: Always set cpuctx cgrp when enable cgroup event

When enable a cgroup event, cpuctx->cgrp setting is conditional
on the current task cgrp matching the event's cgroup, so have to
do it for every new event. It brings complexity but no advantage.

To keep it simple, this patch would always set cpuctx->cgrp
when enable the first cgroup event, and reset to NULL when disable
the last cgroup event.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-5-zhouchengming@bytedance.com
---
 kernel/events/core.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index bdeb41fe7f15..23bb19716ad3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -967,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
 	 */
 	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
 
-	/*
-	 * Since setting cpuctx->cgrp is conditional on the current @cgrp
-	 * matching the event's cgroup, we must do this for every new event,
-	 * because if the first would mismatch, the second would not try again
-	 * and we would leave cpuctx->cgrp unset.
-	 */
-	if (ctx->is_active && !cpuctx->cgrp) {
-		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
-		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
-			cpuctx->cgrp = cgrp;
-	}
-
 	if (ctx->nr_cgroups++)
 		return;
 
+	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
 	list_add(&cpuctx->cgrp_cpuctx_entry,
 			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
 }
@@ -1004,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
 	if (--ctx->nr_cgroups)
 		return;
 
-	if (ctx->is_active && cpuctx->cgrp)
-		cpuctx->cgrp = NULL;
-
+	cpuctx->cgrp = NULL;
 	list_del(&cpuctx->cgrp_cpuctx_entry);
 }
 
-- 
cgit v1.2.3


From 8b023accc8df70e72f7704d29fead7ca914d6837 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 14 Mar 2022 15:19:03 -0700
Subject: lockdep: Fix -Wunused-parameter for _THIS_IP_

While looking into a bug related to the compiler's handling of addresses
of labels, I noticed some uses of _THIS_IP_ seemed unused in lockdep.
Drive by cleanup.

-Wunused-parameter:
kernel/locking/lockdep.c:1383:22: warning: unused parameter 'ip'
kernel/locking/lockdep.c:4246:48: warning: unused parameter 'ip'
kernel/locking/lockdep.c:4844:19: warning: unused parameter 'ip'

Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/20220314221909.2027027-1-ndesaulniers@google.com
---
 kernel/entry/common.c           |  6 +++---
 kernel/locking/lockdep.c        | 22 ++++++++--------------
 kernel/sched/idle.c             |  2 +-
 kernel/trace/trace_preemptirq.c |  4 ++--
 4 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index e57a224d6b79..2eef7b22e02a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -126,7 +126,7 @@ static __always_inline void __exit_to_user_mode(void)
 {
 	instrumentation_begin();
 	trace_hardirqs_on_prepare();
-	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
 	user_enter_irqoff();
@@ -416,7 +416,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
 			instrumentation_begin();
 			/* Tell the tracer that IRET will enable interrupts */
 			trace_hardirqs_on_prepare();
-			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+			lockdep_hardirqs_on_prepare();
 			instrumentation_end();
 			rcu_irq_exit();
 			lockdep_hardirqs_on(CALLER_ADDR0);
@@ -465,7 +465,7 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
 	ftrace_nmi_exit();
 	if (irq_state.lockdep) {
 		trace_hardirqs_on_prepare();
-		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+		lockdep_hardirqs_on_prepare();
 	}
 	instrumentation_end();
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c06cab6546ed..3cbd49216174 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1380,7 +1380,7 @@ static struct lock_list *alloc_list_entry(void)
  */
 static int add_lock_to_list(struct lock_class *this,
 			    struct lock_class *links_to, struct list_head *head,
-			    unsigned long ip, u16 distance, u8 dep,
+			    u16 distance, u8 dep,
 			    const struct lock_trace *trace)
 {
 	struct lock_list *entry;
@@ -3133,19 +3133,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * to the previous lock's dependency list:
 	 */
 	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
-			       &hlock_class(prev)->locks_after,
-			       next->acquire_ip, distance,
-			       calc_dep(prev, next),
-			       *trace);
+			       &hlock_class(prev)->locks_after, distance,
+			       calc_dep(prev, next), *trace);
 
 	if (!ret)
 		return 0;
 
 	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
-			       &hlock_class(next)->locks_before,
-			       next->acquire_ip, distance,
-			       calc_depb(prev, next),
-			       *trace);
+			       &hlock_class(next)->locks_before, distance,
+			       calc_depb(prev, next), *trace);
 	if (!ret)
 		return 0;
 
@@ -4236,14 +4232,13 @@ static void __trace_hardirqs_on_caller(void)
 
 /**
  * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
- * @ip:		Caller address
  *
  * Invoked before a possible transition to RCU idle from exit to user or
  * guest mode. This ensures that all RCU operations are done before RCU
  * stops watching. After the RCU transition lockdep_hardirqs_on() has to be
  * invoked to set the final state.
  */
-void lockdep_hardirqs_on_prepare(unsigned long ip)
+void lockdep_hardirqs_on_prepare(void)
 {
 	if (unlikely(!debug_locks))
 		return;
@@ -4840,8 +4835,7 @@ EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
 
 static void
 print_lock_nested_lock_not_held(struct task_struct *curr,
-				struct held_lock *hlock,
-				unsigned long ip)
+				struct held_lock *hlock)
 {
 	if (!debug_locks_off())
 		return;
@@ -5017,7 +5011,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
 
 	if (nest_lock && !__lock_is_held(nest_lock, -1)) {
-		print_lock_nested_lock_not_held(curr, hlock, ip);
+		print_lock_nested_lock_not_held(curr, hlock);
 		return 0;
 	}
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f8b5020e76a..3984dd21a5aa 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -102,7 +102,7 @@ void __cpuidle default_idle_call(void)
 		 * last -- this is very similar to the entry code.
 		 */
 		trace_hardirqs_on_prepare();
-		lockdep_hardirqs_on_prepare(_THIS_IP_);
+		lockdep_hardirqs_on_prepare();
 		rcu_idle_enter();
 		lockdep_hardirqs_on(_THIS_IP_);
 
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f4938040c228..95b58bd757ce 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -46,7 +46,7 @@ void trace_hardirqs_on(void)
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
 
-	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	lockdep_hardirqs_on_prepare();
 	lockdep_hardirqs_on(CALLER_ADDR0);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
@@ -94,7 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
 
-	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	lockdep_hardirqs_on_prepare();
 	lockdep_hardirqs_on(CALLER_ADDR0);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
-- 
cgit v1.2.3


From f9e21aa9e6fb11355e54c8949a390d49ca21cde1 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 22 Mar 2022 11:20:57 -0400
Subject: locking/rwsem: No need to check for handoff bit if wait queue empty

Since commit d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling
more consistent"), the handoff bit is always cleared if the wait queue
becomes empty. There is no need to check for RWSEM_FLAG_HANDOFF when
the wait list is known to be empty.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220322152059.2182333-2-longman@redhat.com
---
 kernel/locking/rwsem.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index acde5d6f1254..b077b1b2d4b3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -977,12 +977,11 @@ queue:
 	if (list_empty(&sem->wait_list)) {
 		/*
 		 * In case the wait queue is empty and the lock isn't owned
-		 * by a writer or has the handoff bit set, this reader can
-		 * exit the slowpath and return immediately as its
-		 * RWSEM_READER_BIAS has already been set in the count.
+		 * by a writer, this reader can exit the slowpath and return
+		 * immediately as its RWSEM_READER_BIAS has already been set
+		 * in the count.
 		 */
-		if (!(atomic_long_read(&sem->count) &
-		     (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+		if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
 			/* Provide lock ACQUIRE */
 			smp_acquire__after_ctrl_dep();
 			raw_spin_unlock_irq(&sem->wait_lock);
-- 
cgit v1.2.3


From 54c1ee4d614d52844cf24c46d8415bf1392021d0 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 22 Mar 2022 11:20:58 -0400
Subject: locking/rwsem: Conditionally wake waiters in reader/writer slowpaths

In an analysis of a recent vmcore, a reader-owned rwsem was found with
385 readers but no writer in the wait queue. That is kind of unusual
but it may be caused by some race conditions that we have not fully
understood yet. In such a case, all the readers in the wait queue should
join the other reader-owners and acquire the read lock.

In rwsem_down_write_slowpath(), an incoming writer will try to
wake up the front readers under such circumstance. That is not
the case for rwsem_down_read_slowpath(), add a new helper function
rwsem_cond_wake_waiter() to do wakeup and use it in both reader and
writer slowpaths to have a consistent and correct behavior.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220322152059.2182333-3-longman@redhat.com
---
 kernel/locking/rwsem.c | 68 ++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index b077b1b2d4b3..03cb97a8e4cd 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -901,7 +901,7 @@ done:
  */
 static inline void clear_nonspinnable(struct rw_semaphore *sem)
 {
-	if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+	if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
 		atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
 }
 
@@ -925,6 +925,31 @@ rwsem_spin_on_owner(struct rw_semaphore *sem)
 }
 #endif
 
+/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+					  struct wake_q_head *wake_q)
+{
+	enum rwsem_wake_type wake_type;
+
+	if (count & RWSEM_WRITER_MASK)
+		return;
+
+	if (count & RWSEM_READER_MASK) {
+		wake_type = RWSEM_WAKE_READERS;
+	} else {
+		wake_type = RWSEM_WAKE_ANY;
+		clear_nonspinnable(sem);
+	}
+	rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
 /*
  * Wait for the read lock to be granted
  */
@@ -935,7 +960,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
 	long rcnt = (count >> RWSEM_READER_SHIFT);
 	struct rwsem_waiter waiter;
 	DEFINE_WAKE_Q(wake_q);
-	bool wake = false;
 
 	/*
 	 * To prevent a constant stream of readers from starving a sleeping
@@ -996,22 +1020,11 @@ queue:
 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
 
-	/*
-	 * If there are no active locks, wake the front queued process(es).
-	 *
-	 * If there are no writers and we are first in the queue,
-	 * wake our own waiter to join the existing active readers !
-	 */
-	if (!(count & RWSEM_LOCK_MASK)) {
-		clear_nonspinnable(sem);
-		wake = true;
-	}
-	if (wake || (!(count & RWSEM_WRITER_MASK) &&
-		    (adjustment & RWSEM_FLAG_WAITERS)))
-		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
+	rwsem_cond_wake_waiter(sem, count, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
-	wake_up_q(&wake_q);
+
+	if (!wake_q_empty(&wake_q))
+		wake_up_q(&wake_q);
 
 	/* wait to be given the lock */
 	for (;;) {
@@ -1050,7 +1063,6 @@ out_nolock:
 static struct rw_semaphore __sched *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
-	long count;
 	struct rwsem_waiter waiter;
 	DEFINE_WAKE_Q(wake_q);
 
@@ -1074,23 +1086,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 
 	/* we're now waiting on the lock */
 	if (rwsem_first_waiter(sem) != &waiter) {
-		count = atomic_long_read(&sem->count);
-
-		/*
-		 * If there were already threads queued before us and:
-		 *  1) there are no active locks, wake the front
-		 *     queued process(es) as the handoff bit might be set.
-		 *  2) there are no active writers and some readers, the lock
-		 *     must be read owned; so we try to wake any read lock
-		 *     waiters that were queued ahead of us.
-		 */
-		if (count & RWSEM_WRITER_MASK)
-			goto wait;
-
-		rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
-					? RWSEM_WAKE_READERS
-					: RWSEM_WAKE_ANY, &wake_q);
-
+		rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+				       &wake_q);
 		if (!wake_q_empty(&wake_q)) {
 			/*
 			 * We want to minimize wait_lock hold time especially
@@ -1105,7 +1102,6 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
 	}
 
-wait:
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-- 
cgit v1.2.3


From 1ee326196c66583006b0c95356a4b7dc51bf3531 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 22 Mar 2022 11:20:59 -0400
Subject: locking/rwsem: Always try to wake waiters in out_nolock path

For writers, the out_nolock path will always attempt to wake up waiters.
This may not be really necessary if the waiter to be removed is not the
first one.

For readers, no attempt to wake up waiter is being made. However, if
the HANDOFF bit is set and the reader to be removed is the first waiter,
the waiter behind it will inherit the HANDOFF bit and for a write lock
waiter waking it up will allow it to spin on the lock to acquire it
faster. So it can be beneficial to do a wakeup in this case.

Add a new rwsem_del_wake_waiter() helper function to do that consistently
for both reader and writer out_nolock paths.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220322152059.2182333-4-longman@redhat.com
---
 kernel/locking/rwsem.c | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 03cb97a8e4cd..16b532bb5b92 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -375,16 +375,19 @@ rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
  *
  * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
  * this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
  */
-static inline void
+static inline bool
 rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
 {
 	lockdep_assert_held(&sem->wait_lock);
 	list_del(&waiter->list);
 	if (likely(!list_empty(&sem->wait_list)))
-		return;
+		return true;
 
 	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+	return false;
 }
 
 /*
@@ -558,6 +561,33 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 	}
 }
 
+/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+		      struct wake_q_head *wake_q)
+		      __releases(&sem->wait_lock)
+{
+	bool first = rwsem_first_waiter(sem) == waiter;
+
+	wake_q_init(wake_q);
+
+	/*
+	 * If the wait_list isn't empty and the waiter to be deleted is
+	 * the first waiter, we wake up the remaining waiters as they may
+	 * be eligible to acquire or spin on the lock.
+	 */
+	if (rwsem_del_waiter(sem, waiter) && first)
+		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+	raw_spin_unlock_irq(&sem->wait_lock);
+	if (!wake_q_empty(wake_q))
+		wake_up_q(wake_q);
+}
+
 /*
  * This function must be called with the sem->wait_lock held to prevent
  * race conditions between checking the rwsem wait list and setting the
@@ -1050,8 +1080,7 @@ queue:
 	return sem;
 
 out_nolock:
-	rwsem_del_waiter(sem, &waiter);
-	raw_spin_unlock_irq(&sem->wait_lock);
+	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
 	return ERR_PTR(-EINTR);
@@ -1095,7 +1124,6 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 			 */
 			raw_spin_unlock_irq(&sem->wait_lock);
 			wake_up_q(&wake_q);
-			wake_q_init(&wake_q);	/* Used again, reinit */
 			raw_spin_lock_irq(&sem->wait_lock);
 		}
 	} else {
@@ -1148,11 +1176,7 @@ trylock_again:
 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	rwsem_del_waiter(sem, &waiter);
-	if (!list_empty(&sem->wait_list))
-		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-	raw_spin_unlock_irq(&sem->wait_lock);
-	wake_up_q(&wake_q);
+	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
 	lockevent_inc(rwsem_wlock_fail);
 	return ERR_PTR(-EINTR);
 }
-- 
cgit v1.2.3


From 16edd9b511a13e7760ed4b92ba4e39bacda5c86f Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 22 Mar 2022 11:57:08 -0700
Subject: locking: Add lock contention tracepoints

This adds two new lock contention tracepoints like below:

 * lock:contention_begin
 * lock:contention_end

The lock:contention_begin takes a flags argument to classify locks.  I
found it useful to identify what kind of locks it's tracing like if
it's spinning or sleeping, reader-writer lock, real-time, and per-cpu.

Move tracepoint definitions into mutex.c so that we can use them
without lockdep.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Link: https://lkml.kernel.org/r/20220322185709.141236-2-namhyung@kernel.org
---
 kernel/locking/lockdep.c | 1 -
 kernel/locking/mutex.c   | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 3cbd49216174..a6e671b8608d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -60,7 +60,6 @@
 
 #include "lockdep_internals.h"
 
-#define CREATE_TRACE_POINTS
 #include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 5e3585950ec8..ee2fd7614a93 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -30,6 +30,9 @@
 #include <linux/debug_locks.h>
 #include <linux/osq_lock.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
 #ifndef CONFIG_PREEMPT_RT
 #include "mutex.h"
 
-- 
cgit v1.2.3


From ee042be16cb455116d0fe99b77c6bc8baf87c8c6 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 22 Mar 2022 11:57:09 -0700
Subject: locking: Apply contention tracepoints in the slow path

Adding the lock contention tracepoints in various lock function slow
paths.  Note that each arch can define spinlock differently, I only
added it only to the generic qspinlock for now.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Link: https://lkml.kernel.org/r/20220322185709.141236-3-namhyung@kernel.org
---
 kernel/locking/mutex.c        |  3 +++
 kernel/locking/percpu-rwsem.c |  5 +++++
 kernel/locking/qrwlock.c      |  9 +++++++++
 kernel/locking/qspinlock.c    |  5 +++++
 kernel/locking/rtmutex.c      | 11 +++++++++++
 kernel/locking/rwbase_rt.c    |  7 +++++++
 kernel/locking/rwsem.c        |  9 +++++++++
 kernel/locking/semaphore.c    | 15 ++++++++++++++-
 8 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ee2fd7614a93..c88deda77cf2 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -644,6 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	}
 
 	set_current_state(state);
+	trace_contention_begin(lock, 0);
 	for (;;) {
 		bool first;
 
@@ -710,6 +711,7 @@ acquired:
 skip_wait:
 	/* got the lock - cleanup and rejoice! */
 	lock_acquired(&lock->dep_map, ip);
+	trace_contention_end(lock, 0);
 
 	if (ww_ctx)
 		ww_mutex_lock_acquired(ww, ww_ctx);
@@ -721,6 +723,7 @@ skip_wait:
 err:
 	__set_current_state(TASK_RUNNING);
 	__mutex_remove_waiter(lock, &waiter);
+	trace_contention_end(lock, ret);
 err_early_kill:
 	raw_spin_unlock(&lock->wait_lock);
 	debug_mutex_free_waiter(&waiter);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index c9fdae94e098..5fe4c5495ba3 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -9,6 +9,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/debug.h>
 #include <linux/errno.h>
+#include <trace/events/lock.h>
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *key)
@@ -171,9 +172,11 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
 	if (try)
 		return false;
 
+	trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
 	preempt_enable();
 	percpu_rwsem_wait(sem, /* .reader = */ true);
 	preempt_disable();
+	trace_contention_end(sem, 0);
 
 	return true;
 }
@@ -216,6 +219,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+	trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
 
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
@@ -237,6 +241,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
 
 	/* Wait for all active readers to complete. */
 	rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+	trace_contention_end(sem, 0);
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index ec36b73f4733..7f42e52a648f 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,6 +12,7 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/spinlock.h>
+#include <trace/events/lock.h>
 
 /**
  * queued_read_lock_slowpath - acquire read lock of a queue rwlock
@@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
 	}
 	atomic_sub(_QR_BIAS, &lock->cnts);
 
+	trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
 	/*
 	 * Put the reader into the wait queue
 	 */
@@ -51,6 +54,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
 	 * Signal the next one in queue to become queue head
 	 */
 	arch_spin_unlock(&lock->wait_lock);
+
+	trace_contention_end(lock, 0);
 }
 EXPORT_SYMBOL(queued_read_lock_slowpath);
 
@@ -62,6 +67,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 {
 	int cnts;
 
+	trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
 	/* Put the writer into the wait queue */
 	arch_spin_lock(&lock->wait_lock);
 
@@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 	} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
 unlock:
 	arch_spin_unlock(&lock->wait_lock);
+
+	trace_contention_end(lock, 0);
 }
 EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..65a9a10caa6f 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
 #include <linux/prefetch.h>
 #include <asm/byteorder.h>
 #include <asm/qspinlock.h>
+#include <trace/events/lock.h>
 
 /*
  * Include queued spinlock statistics code
@@ -401,6 +402,8 @@ pv_queue:
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
 
+	trace_contention_begin(lock, LCB_F_SPIN);
+
 	/*
 	 * 4 nodes are allocated based on the assumption that there will
 	 * not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ locked:
 	pv_kick_node(lock, next);
 
 release:
+	trace_contention_end(lock, 0);
+
 	/*
 	 * release the node
 	 */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8555c4efe97c..7779ee8abc2a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -24,6 +24,8 @@
 #include <linux/sched/wake_q.h>
 #include <linux/ww_mutex.h>
 
+#include <trace/events/lock.h>
+
 #include "rtmutex_common.h"
 
 #ifndef WW_RT
@@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 
 	set_current_state(state);
 
+	trace_contention_begin(lock, LCB_F_RT);
+
 	ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
 	if (likely(!ret))
 		ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
@@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
 	 * unconditionally. We might have to fix that up.
 	 */
 	fixup_rt_mutex_waiters(lock);
+
+	trace_contention_end(lock, ret);
+
 	return ret;
 }
 
@@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
 	/* Save current state and set state to TASK_RTLOCK_WAIT */
 	current_save_and_set_rtlock_wait_state();
 
+	trace_contention_begin(lock, LCB_F_RT);
+
 	task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
 
 	for (;;) {
@@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
 	 */
 	fixup_rt_mutex_waiters(lock);
 	debug_rt_mutex_free_waiter(&waiter);
+
+	trace_contention_end(lock, 0);
 }
 
 static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 6fd3162e4098..c201aadb9301 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	 * Reader2 to call up_read(), which might be unbound.
 	 */
 
+	trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
 	/*
 	 * For rwlocks this returns 0 unconditionally, so the below
 	 * !ret conditionals are optimized out.
@@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	raw_spin_unlock_irq(&rtm->wait_lock);
 	if (!ret)
 		rwbase_rtmutex_unlock(rtm);
+
+	trace_contention_end(rwb, ret);
 	return ret;
 }
 
@@ -247,11 +251,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 		goto out_unlock;
 
 	rwbase_set_and_save_current_state(state);
+	trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
 	for (;;) {
 		/* Optimized out for rwlocks */
 		if (rwbase_signal_pending_state(state, current)) {
 			rwbase_restore_current_state();
 			__rwbase_write_unlock(rwb, 0, flags);
+			trace_contention_end(rwb, -EINTR);
 			return -EINTR;
 		}
 
@@ -265,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 		set_current_state(state);
 	}
 	rwbase_restore_current_state();
+	trace_contention_end(rwb, 0);
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 16b532bb5b92..9d1db4a54d34 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
 #include <linux/export.h>
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
+#include <trace/events/lock.h>
 
 #ifndef CONFIG_PREEMPT_RT
 #include "lock_events.h"
@@ -1056,6 +1057,8 @@ queue:
 	if (!wake_q_empty(&wake_q))
 		wake_up_q(&wake_q);
 
+	trace_contention_begin(sem, LCB_F_READ);
+
 	/* wait to be given the lock */
 	for (;;) {
 		set_current_state(state);
@@ -1077,12 +1080,14 @@ queue:
 
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock);
+	trace_contention_end(sem, 0);
 	return sem;
 
 out_nolock:
 	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
+	trace_contention_end(sem, -EINTR);
 	return ERR_PTR(-EINTR);
 }
 
@@ -1132,6 +1137,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
+	trace_contention_begin(sem, LCB_F_WRITE);
+
 	for (;;) {
 		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1171,6 +1178,7 @@ trylock_again:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
+	trace_contention_end(sem, 0);
 	return sem;
 
 out_nolock:
@@ -1178,6 +1186,7 @@ out_nolock:
 	raw_spin_lock_irq(&sem->wait_lock);
 	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
 	lockevent_inc(rwsem_wlock_fail);
+	trace_contention_end(sem, -EINTR);
 	return ERR_PTR(-EINTR);
 }
 
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4..f2654d2fe43a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
 #include <linux/ftrace.h>
+#include <trace/events/lock.h>
 
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
@@ -205,7 +206,7 @@ struct semaphore_waiter {
  * constant, and thus optimised away by the compiler.  Likewise the
  * 'timeout' parameter for the cases without timeouts.
  */
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
 								long timeout)
 {
 	struct semaphore_waiter waiter;
@@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 	return -EINTR;
 }
 
+static inline int __sched __down_common(struct semaphore *sem, long state,
+					long timeout)
+{
+	int ret;
+
+	trace_contention_begin(sem, 0);
+	ret = ___down_common(sem, state, timeout);
+	trace_contention_end(sem, ret);
+
+	return ret;
+}
+
 static noinline void __sched __down(struct semaphore *sem)
 {
 	__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-- 
cgit v1.2.3


From dc1f7893a70fe403983bd8492f177bf993940e2c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Mar 2022 13:06:54 +0200
Subject: locking/mutex: Make contention tracepoints more consistent wrt
 adaptive spinning

Have the trace_contention_*() tracepoints consistently include
adaptive spinning. In order to differentiate between the spinning and
non-spinning states add LCB_F_MUTEX and combine with LCB_F_SPIN.

The consequence is that a mutex contention can now triggler multiple
_begin() tracepoints before triggering an _end().

Additionally, this fixes one path where mutex would trigger _end()
without ever seeing a _begin().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/locking/mutex.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index c88deda77cf2..d973fe6041bf 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -602,12 +602,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
+	trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
 	if (__mutex_trylock(lock) ||
 	    mutex_optimistic_spin(lock, ww_ctx, NULL)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
 		if (ww_ctx)
 			ww_mutex_set_context_fastpath(ww, ww_ctx);
+		trace_contention_end(lock, 0);
 		preempt_enable();
 		return 0;
 	}
@@ -644,7 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	}
 
 	set_current_state(state);
-	trace_contention_begin(lock, 0);
+	trace_contention_begin(lock, LCB_F_MUTEX);
 	for (;;) {
 		bool first;
 
@@ -684,10 +686,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		 * state back to RUNNING and fall through the next schedule(),
 		 * or we must see its unlock and acquire.
 		 */
-		if (__mutex_trylock_or_handoff(lock, first) ||
-		    (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
+		if (__mutex_trylock_or_handoff(lock, first))
 			break;
 
+		if (first) {
+			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+				break;
+			trace_contention_begin(lock, LCB_F_MUTEX);
+		}
+
 		raw_spin_lock(&lock->wait_lock);
 	}
 	raw_spin_lock(&lock->wait_lock);
@@ -723,8 +731,8 @@ skip_wait:
 err:
 	__set_current_state(TASK_RUNNING);
 	__mutex_remove_waiter(lock, &waiter);
-	trace_contention_end(lock, ret);
 err_early_kill:
+	trace_contention_end(lock, ret);
 	raw_spin_unlock(&lock->wait_lock);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, ip);
-- 
cgit v1.2.3


From 5693fa74f98afed5421ac0165e9e9291bde7d9e1 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Fri, 12 Nov 2021 18:52:02 +0000
Subject: kcsan: Use preemption model accessors

Per PREEMPT_DYNAMIC, checking CONFIG_PREEMPT doesn't tell you the actual
preemption model of the live kernel. Use the newly-introduced accessors
instead.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Marco Elver <elver@google.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20211112185203.280040-4-valentin.schneider@arm.com
---
 kernel/kcsan/kcsan_test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index a36fca063a73..767dfacd6ed3 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
 	else
 		nthreads *= 2;
 
-	if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+	if (!preempt_model_preemptible() ||
+	    !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
 		/*
 		 * Without any preemption, keep 2 CPUs free for other tasks, one
 		 * of which is the main test case function checking for
 		 * completion or failure.
 		 */
-		const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+		const long min_unused_cpus = preempt_model_none() ? 2 : 0;
 		const long min_required_cpus = 2 + min_unused_cpus;
 
 		if (num_online_cpus() < min_required_cpus) {
-- 
cgit v1.2.3


From cfe43f478b79ba45573ca22d52d0d8823be068fa Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Fri, 12 Nov 2021 18:52:01 +0000
Subject: preempt/dynamic: Introduce preemption model accessors

CONFIG_PREEMPT{_NONE, _VOLUNTARY} designate either:
o The build-time preemption model when !PREEMPT_DYNAMIC
o The default boot-time preemption model when PREEMPT_DYNAMIC

IOW, using those on PREEMPT_DYNAMIC kernels is meaningless - the actual
model could have been set to something else by the "preempt=foo" cmdline
parameter. Same problem applies to CONFIG_PREEMPTION.

Introduce a set of helpers to determine the actual preemption model used by
the live kernel.

Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Marco Elver <elver@google.com>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20211112185203.280040-3-valentin.schneider@arm.com
---
 kernel/sched/core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..068c088e9584 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8409,6 +8409,18 @@ static void __init preempt_dynamic_init(void)
 	}
 }
 
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+	bool preempt_model_##mode(void)						 \
+	{									 \
+		WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+		return preempt_dynamic_mode == preempt_dynamic_##mode;		 \
+	}									 \
+	EXPORT_SYMBOL_GPL(preempt_model_##mode)
+
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+
 #else /* !CONFIG_PREEMPT_DYNAMIC */
 
 static inline void preempt_dynamic_init(void) { }
-- 
cgit v1.2.3


From 089c02ae2771a14af2928c59c56abfb9b885a8d7 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Fri, 12 Nov 2021 18:52:03 +0000
Subject: ftrace: Use preemption model accessors for trace header printout

Per PREEMPT_DYNAMIC, checking CONFIG_PREEMPT doesn't tell you the actual
preemption model of the live kernel. Use the newly-introduced accessors
instead.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20211112185203.280040-5-valentin.schneider@arm.com
---
 kernel/trace/trace.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f4de111fa18f..124f1897fd56 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		   entries,
 		   total,
 		   buf->cpu,
-#if defined(CONFIG_PREEMPT_NONE)
-		   "server",
-#elif defined(CONFIG_PREEMPT_VOLUNTARY)
-		   "desktop",
-#elif defined(CONFIG_PREEMPT)
-		   "preempt",
-#elif defined(CONFIG_PREEMPT_RT)
-		   "preempt_rt",
-#else
+		   preempt_model_none()      ? "server" :
+		   preempt_model_voluntary() ? "desktop" :
+		   preempt_model_full()      ? "preempt" :
+		   preempt_model_rt()        ? "preempt_rt" :
 		   "unknown",
-#endif
 		   /* These are reserved for later use */
 		   0, 0, 0, 0);
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 8ab4ed08a24f88359f22439e37cac65c95cf6ac2 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:32 +0000
Subject: module: Simple refactor in preparation for split

No functional change.

This patch makes it possible to move non-essential code
out of core module code.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h | 21 +++++++++++++++++++++
 kernel/module/main.c     | 22 ++--------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 8c381c99062f..ea8c4c02614c 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -7,6 +7,27 @@
 
 #include <linux/elf.h>
 #include <asm/module.h>
+#include <linux/mutex.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1))
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+extern struct mutex module_mutex;
+extern struct list_head modules;
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const s32 __start___kcrctab[];
+extern const s32 __start___kcrctab_gpl[];
 
 struct load_info {
 	const char *name;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 1a17f02f69a0..5898a1af41a9 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -63,10 +63,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
 /*
  * Modules' sections will be aligned on page boundaries
  * to ensure complete separation of code and data, but
@@ -78,9 +74,6 @@
 # define debug_align(X) (X)
 #endif
 
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
 /*
  * Mutex protects:
  * 1) List of modules (also safely readable with preempt_disable),
@@ -88,8 +81,8 @@
  * 3) module_addr_min/module_addr_max.
  * (delete and add uses RCU list operations).
  */
-static DEFINE_MUTEX(module_mutex);
-static LIST_HEAD(modules);
+DEFINE_MUTEX(module_mutex);
+LIST_HEAD(modules);
 
 /* Work queue for freeing init sections in success case */
 static void do_free_init(struct work_struct *w);
@@ -408,14 +401,6 @@ static __maybe_unused void *any_section_objs(const struct load_info *info,
 	return (void *)info->sechdrs[sec].sh_addr;
 }
 
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
-
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
 #else
@@ -4542,9 +4527,6 @@ static void cfi_cleanup(struct module *mod)
 #endif
 }
 
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
 /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
 static char *module_flags(struct module *mod, char *buf)
 {
-- 
cgit v1.2.3


From 5aff4dfdb4ae2741cfff759d917f597f2c7f70aa Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:33 +0000
Subject: module: Make internal.h and decompress.c more compliant

This patch will address the following warning and style violations
generated by ./scripts/checkpatch.pl in strict mode:

  WARNING: Use #include <linux/module.h> instead of <asm/module.h>
  #10: FILE: kernel/module/internal.h:10:
  +#include <asm/module.h>

  CHECK: spaces preferred around that '-' (ctx:VxV)
  #18: FILE: kernel/module/internal.h:18:
  +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))

  CHECK: Please use a blank line after function/struct/union/enum declarations
  #69: FILE: kernel/module/internal.h:69:
  +}
  +static inline void module_decompress_cleanup(struct load_info *info)
						   ^
  CHECK: extern prototypes should be avoided in .h files
  #84: FILE: kernel/module/internal.h:84:
  +extern int mod_verify_sig(const void *mod, struct load_info *info);

  WARNING: Missing a blank line after declarations
  #116: FILE: kernel/module/decompress.c:116:
  +               struct page *page = module_get_next_page(info);
  +               if (!page) {

  WARNING: Missing a blank line after declarations
  #174: FILE: kernel/module/decompress.c:174:
  +               struct page *page = module_get_next_page(info);
  +               if (!page) {

  CHECK: Please use a blank line after function/struct/union/enum declarations
  #258: FILE: kernel/module/decompress.c:258:
  +}
  +static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);

Note: Fortunately, the multiple-include optimisation found in
include/linux/module.h will prevent duplication/or inclusion more than
once.

Fixes: f314dfea16a0 ("modsign: log module name in the event of an error")
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/decompress.c | 3 +++
 kernel/module/internal.h   | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index d14d6443225a..2fc7081dd7c1 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -113,6 +113,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 
 	do {
 		struct page *page = module_get_next_page(info);
+
 		if (!page) {
 			retval = -ENOMEM;
 			goto out_inflate_end;
@@ -171,6 +172,7 @@ static ssize_t module_xz_decompress(struct load_info *info,
 
 	do {
 		struct page *page = module_get_next_page(info);
+
 		if (!page) {
 			retval = -ENOMEM;
 			goto out;
@@ -256,6 +258,7 @@ static ssize_t compression_show(struct kobject *kobj,
 {
 	return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION));
 }
+
 static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
 
 static int __init module_decompress_sysfs_init(void)
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index ea8c4c02614c..e0775e66bcf7 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -6,7 +6,8 @@
  */
 
 #include <linux/elf.h>
-#include <asm/module.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
 
 #ifndef ARCH_SHF_SMALL
@@ -54,7 +55,7 @@ struct load_info {
 	} index;
 };
 
-extern int mod_verify_sig(const void *mod, struct load_info *info);
+int mod_verify_sig(const void *mod, struct load_info *info);
 
 #ifdef CONFIG_MODULE_DECOMPRESS
 int module_decompress(struct load_info *info, const void *buf, size_t size);
@@ -65,6 +66,7 @@ static inline int module_decompress(struct load_info *info,
 {
 	return -EOPNOTSUPP;
 }
+
 static inline void module_decompress_cleanup(struct load_info *info)
 {
 }
-- 
cgit v1.2.3


From 1be9473e31ab87ad1b6ecf9fd11df461930ede85 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:34 +0000
Subject: module: Move livepatch support to a separate file

No functional change.

This patch migrates livepatch support (i.e. used during module
add/or load and remove/or deletion) from core module code into
kernel/module/livepatch.c. At the moment it contains code to
persist Elf information about a given livepatch module, only.
The new file was added to MAINTAINERS.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile    |   1 +
 kernel/module/internal.h  |  22 ++++++++++
 kernel/module/livepatch.c |  74 +++++++++++++++++++++++++++++++++
 kernel/module/main.c      | 102 +++++-----------------------------------------
 4 files changed, 107 insertions(+), 92 deletions(-)
 create mode 100644 kernel/module/livepatch.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index cdd5c61b8c7f..ed3aacb04f17 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -10,3 +10,4 @@ KCOV_INSTRUMENT_module.o := n
 obj-y += main.o
 obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
 obj-$(CONFIG_MODULE_SIG) += signing.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index e0775e66bcf7..ad7a444253ed 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -57,6 +57,28 @@ struct load_info {
 
 int mod_verify_sig(const void *mod, struct load_info *info);
 
+#ifdef CONFIG_LIVEPATCH
+int copy_module_elf(struct module *mod, struct load_info *info);
+void free_module_elf(struct module *mod);
+#else /* !CONFIG_LIVEPATCH */
+static inline int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	return 0;
+}
+
+static inline void free_module_elf(struct module *mod) { }
+#endif /* CONFIG_LIVEPATCH */
+
+static inline bool set_livepatch_module(struct module *mod)
+{
+#ifdef CONFIG_LIVEPATCH
+	mod->klp = true;
+	return true;
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_MODULE_DECOMPRESS
 int module_decompress(struct load_info *info, const void *buf, size_t size);
 void module_decompress_cleanup(struct load_info *info);
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
new file mode 100644
index 000000000000..486d4ff92719
--- /dev/null
+++ b/kernel/module/livepatch.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module livepatch support
+ *
+ * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Persist Elf information about a module. Copy the Elf header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	unsigned int size, symndx;
+	int ret;
+
+	size = sizeof(*mod->klp_info);
+	mod->klp_info = kmalloc(size, GFP_KERNEL);
+	if (!mod->klp_info)
+		return -ENOMEM;
+
+	/* Elf header */
+	size = sizeof(mod->klp_info->hdr);
+	memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+	/* Elf section header table */
+	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+	mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+	if (!mod->klp_info->sechdrs) {
+		ret = -ENOMEM;
+		goto free_info;
+	}
+
+	/* Elf section name string table */
+	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+	mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+	if (!mod->klp_info->secstrings) {
+		ret = -ENOMEM;
+		goto free_sechdrs;
+	}
+
+	/* Elf symbol section index */
+	symndx = info->index.sym;
+	mod->klp_info->symndx = symndx;
+
+	/*
+	 * For livepatch modules, core_kallsyms.symtab is a complete
+	 * copy of the original symbol table. Adjust sh_addr to point
+	 * to core_kallsyms.symtab since the copy of the symtab in module
+	 * init memory is freed at the end of do_init_module().
+	 */
+	mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab;
+
+	return 0;
+
+free_sechdrs:
+	kfree(mod->klp_info->sechdrs);
+free_info:
+	kfree(mod->klp_info);
+	return ret;
+}
+
+void free_module_elf(struct module *mod)
+{
+	kfree(mod->klp_info->sechdrs);
+	kfree(mod->klp_info->secstrings);
+	kfree(mod->klp_info);
+}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5898a1af41a9..915143827069 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2043,81 +2043,6 @@ static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 }
 #endif /*  CONFIG_STRICT_MODULE_RWX */
 
-#ifdef CONFIG_LIVEPATCH
-/*
- * Persist Elf information about a module. Copy the Elf header,
- * section header table, section string table, and symtab section
- * index from info to mod->klp_info.
- */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
-	unsigned int size, symndx;
-	int ret;
-
-	size = sizeof(*mod->klp_info);
-	mod->klp_info = kmalloc(size, GFP_KERNEL);
-	if (mod->klp_info == NULL)
-		return -ENOMEM;
-
-	/* Elf header */
-	size = sizeof(mod->klp_info->hdr);
-	memcpy(&mod->klp_info->hdr, info->hdr, size);
-
-	/* Elf section header table */
-	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
-	mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
-	if (mod->klp_info->sechdrs == NULL) {
-		ret = -ENOMEM;
-		goto free_info;
-	}
-
-	/* Elf section name string table */
-	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
-	mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
-	if (mod->klp_info->secstrings == NULL) {
-		ret = -ENOMEM;
-		goto free_sechdrs;
-	}
-
-	/* Elf symbol section index */
-	symndx = info->index.sym;
-	mod->klp_info->symndx = symndx;
-
-	/*
-	 * For livepatch modules, core_kallsyms.symtab is a complete
-	 * copy of the original symbol table. Adjust sh_addr to point
-	 * to core_kallsyms.symtab since the copy of the symtab in module
-	 * init memory is freed at the end of do_init_module().
-	 */
-	mod->klp_info->sechdrs[symndx].sh_addr = \
-		(unsigned long) mod->core_kallsyms.symtab;
-
-	return 0;
-
-free_sechdrs:
-	kfree(mod->klp_info->sechdrs);
-free_info:
-	kfree(mod->klp_info);
-	return ret;
-}
-
-static void free_module_elf(struct module *mod)
-{
-	kfree(mod->klp_info->sechdrs);
-	kfree(mod->klp_info->secstrings);
-	kfree(mod->klp_info);
-}
-#else /* !CONFIG_LIVEPATCH */
-static int copy_module_elf(struct module *mod, struct load_info *info)
-{
-	return 0;
-}
-
-static void free_module_elf(struct module *mod)
-{
-}
-#endif /* CONFIG_LIVEPATCH */
-
 void __weak module_memfree(void *module_region)
 {
 	/*
@@ -3092,30 +3017,23 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
 	return 0;
 }
 
-#ifdef CONFIG_LIVEPATCH
 static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
 {
-	if (get_modinfo(info, "livepatch")) {
-		mod->klp = true;
+	if (!get_modinfo(info, "livepatch"))
+		/* Nothing more to do */
+		return 0;
+
+	if (set_livepatch_module(mod)) {
 		add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
 		pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
-			       mod->name);
-	}
-
-	return 0;
-}
-#else /* !CONFIG_LIVEPATCH */
-static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
-{
-	if (get_modinfo(info, "livepatch")) {
-		pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
-		       mod->name);
-		return -ENOEXEC;
+				mod->name);
+		return 0;
 	}
 
-	return 0;
+	pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+	       mod->name);
+	return -ENOEXEC;
 }
-#endif /* CONFIG_LIVEPATCH */
 
 static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
 {
-- 
cgit v1.2.3


From 58d208de3e8d87dbe196caf0b57cc58c7a3836ca Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:35 +0000
Subject: module: Move latched RB-tree support to a separate file

No functional change.

This patch migrates module latched RB-tree support
(e.g. see __module_address()) from core module code
into kernel/module/tree_lookup.c.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile      |   1 +
 kernel/module/internal.h    |  33 +++++++++++
 kernel/module/main.c        | 130 ++------------------------------------------
 kernel/module/tree_lookup.c | 109 +++++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+), 126 deletions(-)
 create mode 100644 kernel/module/tree_lookup.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index ed3aacb04f17..88774e386276 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -11,3 +11,4 @@ obj-y += main.o
 obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
 obj-$(CONFIG_MODULE_SIG) += signing.o
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index ad7a444253ed..f1682e3677be 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -9,6 +9,7 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/rculist.h>
 
 #ifndef ARCH_SHF_SMALL
 #define ARCH_SHF_SMALL 0
@@ -93,3 +94,35 @@ static inline void module_decompress_cleanup(struct load_info *info)
 {
 }
 #endif
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+struct mod_tree_root {
+	struct latch_tree_root root;
+	unsigned long addr_min;
+	unsigned long addr_max;
+};
+
+extern struct mod_tree_root mod_tree;
+
+void mod_tree_insert(struct module *mod);
+void mod_tree_remove_init(struct module *mod);
+void mod_tree_remove(struct module *mod);
+struct module *mod_find(unsigned long addr);
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+
+static inline void mod_tree_insert(struct module *mod) { }
+static inline void mod_tree_remove_init(struct module *mod) { }
+static inline void mod_tree_remove(struct module *mod) { }
+static inline struct module *mod_find(unsigned long addr)
+{
+	struct module *mod;
+
+	list_for_each_entry_rcu(mod, &modules, list,
+				lockdep_is_held(&module_mutex)) {
+		if (within_module(addr, mod))
+			return mod;
+	}
+
+	return NULL;
+}
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 915143827069..0d0fdd82359b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -90,138 +90,16 @@ static DECLARE_WORK(init_free_wq, do_free_init);
 static LLIST_HEAD(init_free_list);
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
-
-/*
- * Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
- *
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
- */
-
-static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
-{
-	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
-	return (unsigned long)layout->base;
-}
-
-static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
-{
-	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
-
-	return (unsigned long)layout->size;
-}
-
-static __always_inline bool
-mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
-{
-	return __mod_tree_val(a) < __mod_tree_val(b);
-}
-
-static __always_inline int
-mod_tree_comp(void *key, struct latch_tree_node *n)
-{
-	unsigned long val = (unsigned long)key;
-	unsigned long start, end;
-
-	start = __mod_tree_val(n);
-	if (val < start)
-		return -1;
-
-	end = start + __mod_tree_size(n);
-	if (val >= end)
-		return 1;
-
-	return 0;
-}
-
-static const struct latch_tree_ops mod_tree_ops = {
-	.less = mod_tree_less,
-	.comp = mod_tree_comp,
-};
-
-static struct mod_tree_root {
-	struct latch_tree_root root;
-	unsigned long addr_min;
-	unsigned long addr_max;
-} mod_tree __cacheline_aligned = {
+struct mod_tree_root mod_tree __cacheline_aligned = {
 	.addr_min = -1UL,
 };
 
 #define module_addr_min mod_tree.addr_min
 #define module_addr_max mod_tree.addr_max
 
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
-{
-	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-static void __mod_tree_remove(struct mod_tree_node *node)
-{
-	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
-}
-
-/*
- * These modifications: insert, remove_init and remove; are serialized by the
- * module_mutex.
- */
-static void mod_tree_insert(struct module *mod)
-{
-	mod->core_layout.mtn.mod = mod;
-	mod->init_layout.mtn.mod = mod;
-
-	__mod_tree_insert(&mod->core_layout.mtn);
-	if (mod->init_layout.size)
-		__mod_tree_insert(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove_init(struct module *mod)
-{
-	if (mod->init_layout.size)
-		__mod_tree_remove(&mod->init_layout.mtn);
-}
-
-static void mod_tree_remove(struct module *mod)
-{
-	__mod_tree_remove(&mod->core_layout.mtn);
-	mod_tree_remove_init(mod);
-}
-
-static struct module *mod_find(unsigned long addr)
-{
-	struct latch_tree_node *ltn;
-
-	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
-	if (!ltn)
-		return NULL;
-
-	return container_of(ltn, struct mod_tree_node, node)->mod;
-}
-
-#else /* MODULES_TREE_LOOKUP */
-
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-static void mod_tree_insert(struct module *mod) { }
-static void mod_tree_remove_init(struct module *mod) { }
-static void mod_tree_remove(struct module *mod) { }
-
-static struct module *mod_find(unsigned long addr)
-{
-	struct module *mod;
-
-	list_for_each_entry_rcu(mod, &modules, list,
-				lockdep_is_held(&module_mutex)) {
-		if (within_module(addr, mod))
-			return mod;
-	}
-
-	return NULL;
-}
-
-#endif /* MODULES_TREE_LOOKUP */
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+static unsigned long module_addr_min = -1UL, module_addr_max;
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
 
 /*
  * Bounds of module text, for speeding up __module_address.
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
new file mode 100644
index 000000000000..0bc4ec3b22ce
--- /dev/null
+++ b/kernel/module/tree_lookup.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Modules tree lookup
+ *
+ * Copyright (C) 2015 Peter Zijlstra
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree_latch.h>
+#include "internal.h"
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+
+	return (unsigned long)layout->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+
+	return (unsigned long)layout->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+	return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long start, end;
+
+	start = __mod_tree_val(n);
+	if (val < start)
+		return -1;
+
+	end = start + __mod_tree_size(n);
+	if (val >= end)
+		return 1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+	.less = mod_tree_less,
+	.comp = mod_tree_comp,
+};
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node)
+{
+	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node)
+{
+	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+void mod_tree_insert(struct module *mod)
+{
+	mod->core_layout.mtn.mod = mod;
+	mod->init_layout.mtn.mod = mod;
+
+	__mod_tree_insert(&mod->core_layout.mtn);
+	if (mod->init_layout.size)
+		__mod_tree_insert(&mod->init_layout.mtn);
+}
+
+void mod_tree_remove_init(struct module *mod)
+{
+	if (mod->init_layout.size)
+		__mod_tree_remove(&mod->init_layout.mtn);
+}
+
+void mod_tree_remove(struct module *mod)
+{
+	__mod_tree_remove(&mod->core_layout.mtn);
+	mod_tree_remove_init(mod);
+}
+
+struct module *mod_find(unsigned long addr)
+{
+	struct latch_tree_node *ltn;
+
+	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+	if (!ltn)
+		return NULL;
+
+	return container_of(ltn, struct mod_tree_node, node)->mod;
+}
-- 
cgit v1.2.3


From b33465fe9c52a3719f013deeca261bd82af235ee Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:36 +0000
Subject: module: Move strict rwx support to a separate file

No functional change.

This patch migrates code that makes module text
and rodata memory read-only and non-text memory
non-executable from core module code into
kernel/module/strict_rwx.c.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile     |  1 +
 kernel/module/internal.h   | 32 +++++++++++++++
 kernel/module/main.c       | 99 +---------------------------------------------
 kernel/module/strict_rwx.c | 85 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 120 insertions(+), 97 deletions(-)
 create mode 100644 kernel/module/strict_rwx.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 88774e386276..d313c8472cb3 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -12,3 +12,4 @@ obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
 obj-$(CONFIG_MODULE_SIG) += signing.o
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
+obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index f1682e3677be..a6895bb5598a 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -20,6 +20,17 @@
 /* Maximum number of characters written by module_flags() */
 #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
 
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
+ */
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+# define debug_align(X) PAGE_ALIGN(X)
+#else
+# define debug_align(X) (X)
+#endif
+
 extern struct mutex module_mutex;
 extern struct list_head modules;
 
@@ -126,3 +137,24 @@ static inline struct module *mod_find(unsigned long addr)
 	return NULL;
 }
 #endif /* CONFIG_MODULES_TREE_LOOKUP */
+
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start,
+								     int num_pages));
+#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+
+#ifdef CONFIG_STRICT_MODULE_RWX
+void module_enable_ro(const struct module *mod, bool after_init);
+void module_enable_nx(const struct module *mod);
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+				char *secstrings, struct module *mod);
+
+#else /* !CONFIG_STRICT_MODULE_RWX */
+static inline void module_enable_nx(const struct module *mod) { }
+static inline void module_enable_ro(const struct module *mod, bool after_init) {}
+static inline int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+				       char *secstrings, struct module *mod)
+{
+	return 0;
+}
+#endif /* CONFIG_STRICT_MODULE_RWX */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 0d0fdd82359b..d55a2a8338a1 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -63,17 +63,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
- */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
 /*
  * Mutex protects:
  * 1) List of modules (also safely readable with preempt_disable),
@@ -1819,8 +1808,8 @@ static void mod_sysfs_teardown(struct module *mod)
  * whether we are strict.
  */
 #ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
-static void frob_text(const struct module_layout *layout,
-		      int (*set_memory)(unsigned long start, int num_pages))
+void frob_text(const struct module_layout *layout,
+	       int (*set_memory)(unsigned long start, int num_pages))
 {
 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
 	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
@@ -1837,90 +1826,6 @@ static void module_enable_x(const struct module *mod)
 static void module_enable_x(const struct module *mod) { }
 #endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
 
-#ifdef CONFIG_STRICT_MODULE_RWX
-static void frob_rodata(const struct module_layout *layout,
-			int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->text_size,
-		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
-				int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->ro_size,
-		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
-			       int (*set_memory)(unsigned long start, int num_pages))
-{
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
-		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_ro(const struct module *mod, bool after_init)
-{
-	if (!rodata_enabled)
-		return;
-
-	set_vm_flush_reset_perms(mod->core_layout.base);
-	set_vm_flush_reset_perms(mod->init_layout.base);
-	frob_text(&mod->core_layout, set_memory_ro);
-
-	frob_rodata(&mod->core_layout, set_memory_ro);
-	frob_text(&mod->init_layout, set_memory_ro);
-	frob_rodata(&mod->init_layout, set_memory_ro);
-
-	if (after_init)
-		frob_ro_after_init(&mod->core_layout, set_memory_ro);
-}
-
-static void module_enable_nx(const struct module *mod)
-{
-	frob_rodata(&mod->core_layout, set_memory_nx);
-	frob_ro_after_init(&mod->core_layout, set_memory_nx);
-	frob_writable_data(&mod->core_layout, set_memory_nx);
-	frob_rodata(&mod->init_layout, set_memory_nx);
-	frob_writable_data(&mod->init_layout, set_memory_nx);
-}
-
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
-				       char *secstrings, struct module *mod)
-{
-	const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR;
-	int i;
-
-	for (i = 0; i < hdr->e_shnum; i++) {
-		if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
-			pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
-				mod->name, secstrings + sechdrs[i].sh_name, i);
-			return -ENOEXEC;
-		}
-	}
-
-	return 0;
-}
-
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static void module_enable_nx(const struct module *mod) { }
-static void module_enable_ro(const struct module *mod, bool after_init) {}
-static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
-				       char *secstrings, struct module *mod)
-{
-	return 0;
-}
-#endif /*  CONFIG_STRICT_MODULE_RWX */
-
 void __weak module_memfree(void *module_region)
 {
 	/*
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
new file mode 100644
index 000000000000..7949dfd449c2
--- /dev/null
+++ b/kernel/module/strict_rwx.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module strict rwx
+ *
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include "internal.h"
+
+static void frob_rodata(const struct module_layout *layout,
+		 int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON(!PAGE_ALIGNED(layout->base));
+	BUG_ON(!PAGE_ALIGNED(layout->text_size));
+	BUG_ON(!PAGE_ALIGNED(layout->ro_size));
+	set_memory((unsigned long)layout->base + layout->text_size,
+		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
+}
+
+static void frob_ro_after_init(const struct module_layout *layout,
+			int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON(!PAGE_ALIGNED(layout->base));
+	BUG_ON(!PAGE_ALIGNED(layout->ro_size));
+	BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size));
+	set_memory((unsigned long)layout->base + layout->ro_size,
+		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
+static void frob_writable_data(const struct module_layout *layout,
+			int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON(!PAGE_ALIGNED(layout->base));
+	BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size));
+	BUG_ON(!PAGE_ALIGNED(layout->size));
+	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
+}
+
+void module_enable_ro(const struct module *mod, bool after_init)
+{
+	if (!rodata_enabled)
+		return;
+
+	set_vm_flush_reset_perms(mod->core_layout.base);
+	set_vm_flush_reset_perms(mod->init_layout.base);
+	frob_text(&mod->core_layout, set_memory_ro);
+
+	frob_rodata(&mod->core_layout, set_memory_ro);
+	frob_text(&mod->init_layout, set_memory_ro);
+	frob_rodata(&mod->init_layout, set_memory_ro);
+
+	if (after_init)
+		frob_ro_after_init(&mod->core_layout, set_memory_ro);
+}
+
+void module_enable_nx(const struct module *mod)
+{
+	frob_rodata(&mod->core_layout, set_memory_nx);
+	frob_ro_after_init(&mod->core_layout, set_memory_nx);
+	frob_writable_data(&mod->core_layout, set_memory_nx);
+	frob_rodata(&mod->init_layout, set_memory_nx);
+	frob_writable_data(&mod->init_layout, set_memory_nx);
+}
+
+int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+				char *secstrings, struct module *mod)
+{
+	const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
+	int i;
+
+	for (i = 0; i < hdr->e_shnum; i++) {
+		if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+			pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+			       mod->name, secstrings + sechdrs[i].sh_name, i);
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 0c1e42805c25c87eb7a6f3b18bdbf3b3b7840aff Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:37 +0000
Subject: module: Move extra signature support out of core code

No functional change.

This patch migrates additional module signature check
code from core module code into kernel/module/signing.c.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h |  9 +++++
 kernel/module/main.c     | 87 ------------------------------------------------
 kernel/module/signing.c  | 77 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 87 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index a6895bb5598a..d6f646a5da41 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -158,3 +158,12 @@ static inline int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 	return 0;
 }
 #endif /* CONFIG_STRICT_MODULE_RWX */
+
+#ifdef CONFIG_MODULE_SIG
+int module_sig_check(struct load_info *info, int flags);
+#else /* !CONFIG_MODULE_SIG */
+static inline int module_sig_check(struct load_info *info, int flags)
+{
+	return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d55a2a8338a1..c349c91e53fa 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -23,7 +23,6 @@
 #include <linux/vmalloc.h>
 #include <linux/elf.h>
 #include <linux/proc_fs.h>
-#include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
@@ -127,28 +126,6 @@ static void module_assert_mutex_or_preempt(void)
 #endif
 }
 
-#ifdef CONFIG_MODULE_SIG
-static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
-module_param(sig_enforce, bool_enable_only, 0644);
-
-void set_module_sig_enforced(void)
-{
-	sig_enforce = true;
-}
-#else
-#define sig_enforce false
-#endif
-
-/*
- * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
- * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
- */
-bool is_module_sig_enforced(void)
-{
-	return sig_enforce;
-}
-EXPORT_SYMBOL(is_module_sig_enforced);
-
 /* Block module loading/unloading? */
 int modules_disabled = 0;
 core_param(nomodule, modules_disabled, bint, 0);
@@ -2569,70 +2546,6 @@ static inline void kmemleak_load_module(const struct module *mod,
 }
 #endif
 
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info, int flags)
-{
-	int err = -ENODATA;
-	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
-	const char *reason;
-	const void *mod = info->hdr;
-	bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
-				       MODULE_INIT_IGNORE_VERMAGIC);
-	/*
-	 * Do not allow mangled modules as a module with version information
-	 * removed is no longer the module that was signed.
-	 */
-	if (!mangled_module &&
-	    info->len > markerlen &&
-	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
-		/* We truncate the module to discard the signature */
-		info->len -= markerlen;
-		err = mod_verify_sig(mod, info);
-		if (!err) {
-			info->sig_ok = true;
-			return 0;
-		}
-	}
-
-	/*
-	 * We don't permit modules to be loaded into the trusted kernels
-	 * without a valid signature on them, but if we're not enforcing,
-	 * certain errors are non-fatal.
-	 */
-	switch (err) {
-	case -ENODATA:
-		reason = "unsigned module";
-		break;
-	case -ENOPKG:
-		reason = "module with unsupported crypto";
-		break;
-	case -ENOKEY:
-		reason = "module with unavailable key";
-		break;
-
-	default:
-		/*
-		 * All other errors are fatal, including lack of memory,
-		 * unparseable signatures, and signature check failures --
-		 * even if signatures aren't required.
-		 */
-		return err;
-	}
-
-	if (is_module_sig_enforced()) {
-		pr_notice("Loading of %s is rejected\n", reason);
-		return -EKEYREJECTED;
-	}
-
-	return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info, int flags)
-{
-	return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
 static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
 {
 #if defined(CONFIG_64BIT)
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
index 8aeb6d2ee94b..85c8999dfecf 100644
--- a/kernel/module/signing.c
+++ b/kernel/module/signing.c
@@ -11,9 +11,29 @@
 #include <linux/module_signature.h>
 #include <linux/string.h>
 #include <linux/verification.h>
+#include <linux/security.h>
 #include <crypto/public_key.h>
+#include <uapi/linux/module.h>
 #include "internal.h"
 
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+	return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+void set_module_sig_enforced(void)
+{
+	sig_enforce = true;
+}
+
 /*
  * Verify the signature on a module.
  */
@@ -43,3 +63,60 @@ int mod_verify_sig(const void *mod, struct load_info *info)
 				      VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
+
+int module_sig_check(struct load_info *info, int flags)
+{
+	int err = -ENODATA;
+	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+	const char *reason;
+	const void *mod = info->hdr;
+	bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+				       MODULE_INIT_IGNORE_VERMAGIC);
+	/*
+	 * Do not allow mangled modules as a module with version information
+	 * removed is no longer the module that was signed.
+	 */
+	if (!mangled_module &&
+	    info->len > markerlen &&
+	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+		/* We truncate the module to discard the signature */
+		info->len -= markerlen;
+		err = mod_verify_sig(mod, info);
+		if (!err) {
+			info->sig_ok = true;
+			return 0;
+		}
+	}
+
+	/*
+	 * We don't permit modules to be loaded into the trusted kernels
+	 * without a valid signature on them, but if we're not enforcing,
+	 * certain errors are non-fatal.
+	 */
+	switch (err) {
+	case -ENODATA:
+		reason = "unsigned module";
+		break;
+	case -ENOPKG:
+		reason = "module with unsupported crypto";
+		break;
+	case -ENOKEY:
+		reason = "module with unavailable key";
+		break;
+
+	default:
+		/*
+		 * All other errors are fatal, including lack of memory,
+		 * unparseable signatures, and signature check failures --
+		 * even if signatures aren't required.
+		 */
+		return err;
+	}
+
+	if (is_module_sig_enforced()) {
+		pr_notice("Loading of %s is rejected\n", reason);
+		return -EKEYREJECTED;
+	}
+
+	return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
-- 
cgit v1.2.3


From 473c84d1856e83faebf059a52a8e49bdb89026d3 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:38 +0000
Subject: module: Move kmemleak support to a separate file

No functional change.

This patch migrates kmemleak code out of core module
code into kernel/module/debug_kmemleak.c

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile         |  1 +
 kernel/module/debug_kmemleak.c | 30 ++++++++++++++++++++++++++++++
 kernel/module/internal.h       |  7 +++++++
 kernel/module/main.c           | 27 ---------------------------
 4 files changed, 38 insertions(+), 27 deletions(-)
 create mode 100644 kernel/module/debug_kmemleak.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index d313c8472cb3..12388627725c 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_MODULE_SIG) += signing.o
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
 obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
new file mode 100644
index 000000000000..12a569d361e8
--- /dev/null
+++ b/kernel/module/debug_kmemleak.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kmemleak support
+ *
+ * Copyright (C) 2009 Catalin Marinas
+ */
+
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+void kmemleak_load_module(const struct module *mod,
+			  const struct load_info *info)
+{
+	unsigned int i;
+
+	/* only scan the sections containing data */
+	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+
+	for (i = 1; i < info->hdr->e_shnum; i++) {
+		/* Scan all writable sections that's not executable */
+		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+		    !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+		    (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
+			continue;
+
+		kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+				   info->sechdrs[i].sh_size, GFP_KERNEL);
+	}
+}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index d6f646a5da41..b0c360839f63 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -167,3 +167,10 @@ static inline int module_sig_check(struct load_info *info, int flags)
 	return 0;
 }
 #endif /* !CONFIG_MODULE_SIG */
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+void kmemleak_load_module(const struct module *mod, const struct load_info *info);
+#else /* !CONFIG_DEBUG_KMEMLEAK */
+static inline void kmemleak_load_module(const struct module *mod,
+					const struct load_info *info) { }
+#endif /* CONFIG_DEBUG_KMEMLEAK */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c349c91e53fa..d400c477c35a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2519,33 +2519,6 @@ bool __weak module_exit_section(const char *name)
 	return strstarts(name, ".exit");
 }
 
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
-				 const struct load_info *info)
-{
-	unsigned int i;
-
-	/* only scan the sections containing data */
-	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
-	for (i = 1; i < info->hdr->e_shnum; i++) {
-		/* Scan all writable sections that's not executable */
-		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
-		    !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
-		    (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
-			continue;
-
-		kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
-				   info->sechdrs[i].sh_size, GFP_KERNEL);
-	}
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
-					const struct load_info *info)
-{
-}
-#endif
-
 static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
 {
 #if defined(CONFIG_64BIT)
-- 
cgit v1.2.3


From 91fb02f31505dc22262b13a129550f470ab90a79 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:39 +0000
Subject: module: Move kallsyms support into a separate file

No functional change.

This patch migrates kallsyms code out of core module
code kernel/module/kallsyms.c

Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile   |   1 +
 kernel/module/internal.h |  29 +++
 kernel/module/kallsyms.c | 502 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/module/main.c     | 531 +----------------------------------------------
 4 files changed, 538 insertions(+), 525 deletions(-)
 create mode 100644 kernel/module/kallsyms.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 12388627725c..9901bed3ab5b 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o
 obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
 obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index b0c360839f63..44ca05b9eb8f 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -68,6 +68,19 @@ struct load_info {
 };
 
 int mod_verify_sig(const void *mod, struct load_info *info);
+struct module *find_module_all(const char *name, size_t len, bool even_unformed);
+int cmp_name(const void *name, const void *sym);
+long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
+		       unsigned int section);
+
+static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+	return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+	return sym->value;
+#endif
+}
 
 #ifdef CONFIG_LIVEPATCH
 int copy_module_elf(struct module *mod, struct load_info *info);
@@ -174,3 +187,19 @@ void kmemleak_load_module(const struct module *mod, const struct load_info *info
 static inline void kmemleak_load_module(const struct module *mod,
 					const struct load_info *info) { }
 #endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#ifdef CONFIG_KALLSYMS
+void init_build_id(struct module *mod, const struct load_info *info);
+void layout_symtab(struct module *mod, struct load_info *info);
+void add_kallsyms(struct module *mod, const struct load_info *info);
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#else /* !CONFIG_KALLSYMS */
+static inline void init_build_id(struct module *mod, const struct load_info *info) { }
+static inline void layout_symtab(struct module *mod, struct load_info *info) { }
+static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+#endif /* CONFIG_KALLSYMS */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
new file mode 100644
index 000000000000..1b0780e20aab
--- /dev/null
+++ b/kernel/module/kallsyms.c
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kallsyms support
+ *
+ * Copyright (C) 2010 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/bsearch.h>
+#include "internal.h"
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+							  const struct kernel_symbol *start,
+							  const struct kernel_symbol *stop)
+{
+	return bsearch(name, start, stop - start,
+			sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+		       const struct module *mod)
+{
+	const struct kernel_symbol *ks;
+
+	if (!mod)
+		ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+	else
+		ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+	return ks && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+	const Elf_Shdr *sechdrs = info->sechdrs;
+
+	if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+			return 'v';
+		else
+			return 'w';
+	}
+	if (sym->st_shndx == SHN_UNDEF)
+		return 'U';
+	if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+		return 'a';
+	if (sym->st_shndx >= SHN_LORESERVE)
+		return '?';
+	if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+		return 't';
+	if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC &&
+	    sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+		if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+			return 'r';
+		else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+			return 'g';
+		else
+			return 'd';
+	}
+	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+		if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+			return 's';
+		else
+			return 'b';
+	}
+	if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+		      ".debug")) {
+		return 'n';
+	}
+	return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+			   unsigned int shnum, unsigned int pcpundx)
+{
+	const Elf_Shdr *sec;
+
+	if (src->st_shndx == SHN_UNDEF ||
+	    src->st_shndx >= shnum ||
+	    !src->st_name)
+		return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+	if (src->st_shndx == pcpundx)
+		return true;
+#endif
+
+	sec = sechdrs + src->st_shndx;
+	if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+	    || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+	    || (sec->sh_entsize & INIT_OFFSET_MASK))
+		return false;
+
+	return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep.  This is simple, but has the effect of making multiple
+ * copies of duplicates.  We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+void layout_symtab(struct module *mod, struct load_info *info)
+{
+	Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+	Elf_Shdr *strsect = info->sechdrs + info->index.str;
+	const Elf_Sym *src;
+	unsigned int i, nsrc, ndst, strtab_size = 0;
+
+	/* Put symbol section at end of init part of module. */
+	symsect->sh_flags |= SHF_ALLOC;
+	symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect,
+						info->index.sym) | INIT_OFFSET_MASK;
+	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+	src = (void *)info->hdr + symsect->sh_offset;
+	nsrc = symsect->sh_size / sizeof(*src);
+
+	/* Compute total space required for the core symbols' strtab. */
+	for (ndst = i = 0; i < nsrc; i++) {
+		if (i == 0 || is_livepatch_module(mod) ||
+		    is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+				   info->index.pcpu)) {
+			strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
+			ndst++;
+		}
+	}
+
+	/* Append room for core symbols at end of core part. */
+	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+	mod->core_layout.size += strtab_size;
+	info->core_typeoffs = mod->core_layout.size;
+	mod->core_layout.size += ndst * sizeof(char);
+	mod->core_layout.size = debug_align(mod->core_layout.size);
+
+	/* Put string table section at end of init part of module. */
+	strsect->sh_flags |= SHF_ALLOC;
+	strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect,
+						info->index.str) | INIT_OFFSET_MASK;
+	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+	/* We'll tack temporary mod_kallsyms on the end. */
+	mod->init_layout.size = ALIGN(mod->init_layout.size,
+				      __alignof__(struct mod_kallsyms));
+	info->mod_kallsyms_init_off = mod->init_layout.size;
+	mod->init_layout.size += sizeof(struct mod_kallsyms);
+	info->init_typeoffs = mod->init_layout.size;
+	mod->init_layout.size += nsrc * sizeof(char);
+	mod->init_layout.size = debug_align(mod->init_layout.size);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section.  Later we switch to the cut-down
+ * core-only ones.
+ */
+void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+	unsigned int i, ndst;
+	const Elf_Sym *src;
+	Elf_Sym *dst;
+	char *s;
+	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+
+	/* Set up to point into init section. */
+	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+	/* The following is safe since this pointer cannot change */
+	mod->kallsyms->symtab = (void *)symsec->sh_addr;
+	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+	/* Make sure we get permanent strtab: don't use info->strtab. */
+	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
+
+	/*
+	 * Now populate the cut down core kallsyms for after init
+	 * and set types up while we still have access to sections.
+	 */
+	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
+	src = mod->kallsyms->symtab;
+	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
+		mod->kallsyms->typetab[i] = elf_type(src + i, info);
+		if (i == 0 || is_livepatch_module(mod) ||
+		    is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+				   info->index.pcpu)) {
+			mod->core_kallsyms.typetab[ndst] =
+			    mod->kallsyms->typetab[i];
+			dst[ndst] = src[i];
+			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+			s += strscpy(s, &mod->kallsyms->strtab[src[i].st_name],
+				     KSYM_NAME_LEN) + 1;
+		}
+	}
+	mod->core_kallsyms.num_symtab = ndst;
+}
+
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+	const Elf_Shdr *sechdr;
+	unsigned int i;
+
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		sechdr = &info->sechdrs[i];
+		if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+		    !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+					sechdr->sh_size))
+			break;
+	}
+}
+#else
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+	if (str[0] == '.' && str[1] == 'L')
+		return true;
+	return str[0] == '$' && strchr("axtd", str[1]) &&
+	       (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+					unsigned long addr,
+					unsigned long *size,
+					unsigned long *offset)
+{
+	unsigned int i, best = 0;
+	unsigned long nextval, bestval;
+	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+	/* At worse, next value is at end of module */
+	if (within_module_init(addr, mod))
+		nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size;
+	else
+		nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size;
+
+	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+	/*
+	 * Scan for closest preceding symbol, and next symbol. (ELF
+	 * starts real symbols at 1).
+	 */
+	for (i = 1; i < kallsyms->num_symtab; i++) {
+		const Elf_Sym *sym = &kallsyms->symtab[i];
+		unsigned long thisval = kallsyms_symbol_value(sym);
+
+		if (sym->st_shndx == SHN_UNDEF)
+			continue;
+
+		/*
+		 * We ignore unnamed symbols: they're uninformative
+		 * and inserted at a whim.
+		 */
+		if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
+		    is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+			continue;
+
+		if (thisval <= addr && thisval > bestval) {
+			best = i;
+			bestval = thisval;
+		}
+		if (thisval > addr && thisval < nextval)
+			nextval = thisval;
+	}
+
+	if (!best)
+		return NULL;
+
+	if (size)
+		*size = nextval - bestval;
+	if (offset)
+		*offset = addr - bestval;
+
+	return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+						     void *ptr)
+{
+	return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution.  NULL means not found.  Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption.
+ */
+const char *module_address_lookup(unsigned long addr,
+				  unsigned long *size,
+			    unsigned long *offset,
+			    char **modname,
+			    const unsigned char **modbuildid,
+			    char *namebuf)
+{
+	const char *ret = NULL;
+	struct module *mod;
+
+	preempt_disable();
+	mod = __module_address(addr);
+	if (mod) {
+		if (modname)
+			*modname = mod->name;
+		if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+			*modbuildid = mod->build_id;
+#else
+			*modbuildid = NULL;
+#endif
+		}
+
+		ret = find_kallsyms_symbol(mod, addr, size, offset);
+	}
+	/* Make a copy in here where it's safe */
+	if (ret) {
+		strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+		ret = namebuf;
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+	struct module *mod;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		if (within_module(addr, mod)) {
+			const char *sym;
+
+			sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+			if (!sym)
+				goto out;
+
+			strscpy(symname, sym, KSYM_NAME_LEN);
+			preempt_enable();
+			return 0;
+		}
+	}
+out:
+	preempt_enable();
+	return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+			       unsigned long *offset, char *modname, char *name)
+{
+	struct module *mod;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		if (within_module(addr, mod)) {
+			const char *sym;
+
+			sym = find_kallsyms_symbol(mod, addr, size, offset);
+			if (!sym)
+				goto out;
+			if (modname)
+				strscpy(modname, mod->name, MODULE_NAME_LEN);
+			if (name)
+				strscpy(name, sym, KSYM_NAME_LEN);
+			preempt_enable();
+			return 0;
+		}
+	}
+out:
+	preempt_enable();
+	return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *name, char *module_name, int *exported)
+{
+	struct module *mod;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod, &modules, list) {
+		struct mod_kallsyms *kallsyms;
+
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		kallsyms = rcu_dereference_sched(mod->kallsyms);
+		if (symnum < kallsyms->num_symtab) {
+			const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+			*value = kallsyms_symbol_value(sym);
+			*type = kallsyms->typetab[symnum];
+			strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+			strscpy(module_name, mod->name, MODULE_NAME_LEN);
+			*exported = is_exported(name, *value, mod);
+			preempt_enable();
+			return 0;
+		}
+		symnum -= kallsyms->num_symtab;
+	}
+	preempt_enable();
+	return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+	unsigned int i;
+	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+
+	for (i = 0; i < kallsyms->num_symtab; i++) {
+		const Elf_Sym *sym = &kallsyms->symtab[i];
+
+		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+		    sym->st_shndx != SHN_UNDEF)
+			return kallsyms_symbol_value(sym);
+	}
+	return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+	struct module *mod;
+	char *colon;
+	unsigned long ret = 0;
+
+	/* Don't lock: we're in enough trouble already. */
+	preempt_disable();
+	if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+		if ((mod = find_module_all(name, colon - name, false)) != NULL)
+			ret = find_kallsyms_symbol_value(mod, colon + 1);
+	} else {
+		list_for_each_entry_rcu(mod, &modules, list) {
+			if (mod->state == MODULE_STATE_UNFORMED)
+				continue;
+			if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
+				break;
+		}
+	}
+	preempt_enable();
+	return ret;
+}
+
+#ifdef CONFIG_LIVEPATCH
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data)
+{
+	struct module *mod;
+	unsigned int i;
+	int ret = 0;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(mod, &modules, list) {
+		/* We hold module_mutex: no need for rcu_dereference_sched */
+		struct mod_kallsyms *kallsyms = mod->kallsyms;
+
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+		for (i = 0; i < kallsyms->num_symtab; i++) {
+			const Elf_Sym *sym = &kallsyms->symtab[i];
+
+			if (sym->st_shndx == SHN_UNDEF)
+				continue;
+
+			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+				 mod, kallsyms_symbol_value(sym));
+			if (ret != 0)
+				goto out;
+		}
+	}
+out:
+	mutex_unlock(&module_mutex);
+	return ret;
+}
+#endif /* CONFIG_LIVEPATCH */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d400c477c35a..4d74988e1814 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -288,15 +288,6 @@ static bool check_exported_symbol(const struct symsearch *syms,
 	return true;
 }
 
-static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
-{
-#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
-	return (unsigned long)offset_to_ptr(&sym->value_offset);
-#else
-	return sym->value;
-#endif
-}
-
 static const char *kernel_symbol_name(const struct kernel_symbol *sym)
 {
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
@@ -317,7 +308,7 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
 #endif
 }
 
-static int cmp_name(const void *name, const void *sym)
+int cmp_name(const void *name, const void *sym)
 {
 	return strcmp(name, kernel_symbol_name(sym));
 }
@@ -387,8 +378,8 @@ static bool find_symbol(struct find_symbol_arg *fsa)
  * Search for module by name: must hold module_mutex (or preempt disabled
  * for read-only access).
  */
-static struct module *find_module_all(const char *name, size_t len,
-				      bool even_unformed)
+struct module *find_module_all(const char *name, size_t len,
+			       bool even_unformed)
 {
 	struct module *mod;
 
@@ -1294,13 +1285,6 @@ resolve_symbol_wait(struct module *mod,
 	return ksym;
 }
 
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
-	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-#endif
-
 /*
  * /sys/module/foo/sections stuff
  * J. Corbet <corbet@lwn.net>
@@ -2065,7 +2049,7 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod,
 }
 
 /* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
+long module_get_offset(struct module *mod, unsigned int *size,
 		       Elf_Shdr *sechdr, unsigned int section)
 {
 	long ret;
@@ -2121,7 +2105,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			    || s->sh_entsize != ~0UL
 			    || module_init_layout_section(sname))
 				continue;
-			s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
+			s->sh_entsize = module_get_offset(mod, &mod->core_layout.size, s, i);
 			pr_debug("\t%s\n", sname);
 		}
 		switch (m) {
@@ -2154,7 +2138,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			    || s->sh_entsize != ~0UL
 			    || !module_init_layout_section(sname))
 				continue;
-			s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
+			s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i)
 					 | INIT_OFFSET_MASK);
 			pr_debug("\t%s\n", sname);
 		}
@@ -2267,228 +2251,6 @@ static void free_modinfo(struct module *mod)
 	}
 }
 
-#ifdef CONFIG_KALLSYMS
-
-/* Lookup exported symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_exported_symbol(const char *name,
-							  const struct kernel_symbol *start,
-							  const struct kernel_symbol *stop)
-{
-	return bsearch(name, start, stop - start,
-			sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
-		       const struct module *mod)
-{
-	const struct kernel_symbol *ks;
-	if (!mod)
-		ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
-	else
-		ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
-
-	return ks != NULL && kernel_symbol_value(ks) == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
-	const Elf_Shdr *sechdrs = info->sechdrs;
-
-	if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
-		if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
-			return 'v';
-		else
-			return 'w';
-	}
-	if (sym->st_shndx == SHN_UNDEF)
-		return 'U';
-	if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
-		return 'a';
-	if (sym->st_shndx >= SHN_LORESERVE)
-		return '?';
-	if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
-		return 't';
-	if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
-	    && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
-		if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
-			return 'r';
-		else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
-			return 'g';
-		else
-			return 'd';
-	}
-	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
-		if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
-			return 's';
-		else
-			return 'b';
-	}
-	if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
-		      ".debug")) {
-		return 'n';
-	}
-	return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
-			unsigned int shnum, unsigned int pcpundx)
-{
-	const Elf_Shdr *sec;
-
-	if (src->st_shndx == SHN_UNDEF
-	    || src->st_shndx >= shnum
-	    || !src->st_name)
-		return false;
-
-#ifdef CONFIG_KALLSYMS_ALL
-	if (src->st_shndx == pcpundx)
-		return true;
-#endif
-
-	sec = sechdrs + src->st_shndx;
-	if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
-	    || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
-	    || (sec->sh_entsize & INIT_OFFSET_MASK))
-		return false;
-
-	return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep.  This is simple, but has the effect of making multiple
- * copies of duplicates.  We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
-	Elf_Shdr *symsect = info->sechdrs + info->index.sym;
-	Elf_Shdr *strsect = info->sechdrs + info->index.str;
-	const Elf_Sym *src;
-	unsigned int i, nsrc, ndst, strtab_size = 0;
-
-	/* Put symbol section at end of init part of module. */
-	symsect->sh_flags |= SHF_ALLOC;
-	symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
-					 info->index.sym) | INIT_OFFSET_MASK;
-	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
-	src = (void *)info->hdr + symsect->sh_offset;
-	nsrc = symsect->sh_size / sizeof(*src);
-
-	/* Compute total space required for the core symbols' strtab. */
-	for (ndst = i = 0; i < nsrc; i++) {
-		if (i == 0 || is_livepatch_module(mod) ||
-		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
-				   info->index.pcpu)) {
-			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
-			ndst++;
-		}
-	}
-
-	/* Append room for core symbols at end of core part. */
-	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
-	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
-	mod->core_layout.size += strtab_size;
-	info->core_typeoffs = mod->core_layout.size;
-	mod->core_layout.size += ndst * sizeof(char);
-	mod->core_layout.size = debug_align(mod->core_layout.size);
-
-	/* Put string table section at end of init part of module. */
-	strsect->sh_flags |= SHF_ALLOC;
-	strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
-					 info->index.str) | INIT_OFFSET_MASK;
-	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-
-	/* We'll tack temporary mod_kallsyms on the end. */
-	mod->init_layout.size = ALIGN(mod->init_layout.size,
-				      __alignof__(struct mod_kallsyms));
-	info->mod_kallsyms_init_off = mod->init_layout.size;
-	mod->init_layout.size += sizeof(struct mod_kallsyms);
-	info->init_typeoffs = mod->init_layout.size;
-	mod->init_layout.size += nsrc * sizeof(char);
-	mod->init_layout.size = debug_align(mod->init_layout.size);
-}
-
-/*
- * We use the full symtab and strtab which layout_symtab arranged to
- * be appended to the init section.  Later we switch to the cut-down
- * core-only ones.
- */
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-	unsigned int i, ndst;
-	const Elf_Sym *src;
-	Elf_Sym *dst;
-	char *s;
-	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
-	/* Set up to point into init section. */
-	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
-
-	mod->kallsyms->symtab = (void *)symsec->sh_addr;
-	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
-	/* Make sure we get permanent strtab: don't use info->strtab. */
-	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
-	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
-
-	/*
-	 * Now populate the cut down core kallsyms for after init
-	 * and set types up while we still have access to sections.
-	 */
-	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
-	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
-	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
-	src = mod->kallsyms->symtab;
-	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
-		mod->kallsyms->typetab[i] = elf_type(src + i, info);
-		if (i == 0 || is_livepatch_module(mod) ||
-		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
-				   info->index.pcpu)) {
-			mod->core_kallsyms.typetab[ndst] =
-			    mod->kallsyms->typetab[i];
-			dst[ndst] = src[i];
-			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
-			s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
-				     KSYM_NAME_LEN) + 1;
-		}
-	}
-	mod->core_kallsyms.num_symtab = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-	const Elf_Shdr *sechdr;
-	unsigned int i;
-
-	for (i = 0; i < info->hdr->e_shnum; i++) {
-		sechdr = &info->sechdrs[i];
-		if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
-		    !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
-					sechdr->sh_size))
-			break;
-	}
-}
-#else
-static void init_build_id(struct module *mod, const struct load_info *info)
-{
-}
-#endif
-
 static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
 {
 	if (!debug)
@@ -3799,287 +3561,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
 	return ((void *)addr >= start && (void *)addr < start + size);
 }
 
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
-	if (str[0] == '.' && str[1] == 'L')
-		return true;
-	return str[0] == '$' && strchr("axtd", str[1])
-	       && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
-{
-	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
-}
-
-/*
- * Given a module and address, find the corresponding symbol and return its name
- * while providing its size and offset if needed.
- */
-static const char *find_kallsyms_symbol(struct module *mod,
-					unsigned long addr,
-					unsigned long *size,
-					unsigned long *offset)
-{
-	unsigned int i, best = 0;
-	unsigned long nextval, bestval;
-	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
-	/* At worse, next value is at end of module */
-	if (within_module_init(addr, mod))
-		nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
-	else
-		nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
-
-	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
-
-	/*
-	 * Scan for closest preceding symbol, and next symbol. (ELF
-	 * starts real symbols at 1).
-	 */
-	for (i = 1; i < kallsyms->num_symtab; i++) {
-		const Elf_Sym *sym = &kallsyms->symtab[i];
-		unsigned long thisval = kallsyms_symbol_value(sym);
-
-		if (sym->st_shndx == SHN_UNDEF)
-			continue;
-
-		/*
-		 * We ignore unnamed symbols: they're uninformative
-		 * and inserted at a whim.
-		 */
-		if (*kallsyms_symbol_name(kallsyms, i) == '\0'
-		    || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
-			continue;
-
-		if (thisval <= addr && thisval > bestval) {
-			best = i;
-			bestval = thisval;
-		}
-		if (thisval > addr && thisval < nextval)
-			nextval = thisval;
-	}
-
-	if (!best)
-		return NULL;
-
-	if (size)
-		*size = nextval - bestval;
-	if (offset)
-		*offset = addr - bestval;
-
-	return kallsyms_symbol_name(kallsyms, best);
-}
-
-void * __weak dereference_module_function_descriptor(struct module *mod,
-						     void *ptr)
-{
-	return ptr;
-}
-
-/*
- * For kallsyms to ask for address resolution.  NULL means not found.  Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption.
- */
-const char *module_address_lookup(unsigned long addr,
-			    unsigned long *size,
-			    unsigned long *offset,
-			    char **modname,
-			    const unsigned char **modbuildid,
-			    char *namebuf)
-{
-	const char *ret = NULL;
-	struct module *mod;
-
-	preempt_disable();
-	mod = __module_address(addr);
-	if (mod) {
-		if (modname)
-			*modname = mod->name;
-		if (modbuildid) {
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
-			*modbuildid = mod->build_id;
-#else
-			*modbuildid = NULL;
-#endif
-		}
-
-		ret = find_kallsyms_symbol(mod, addr, size, offset);
-	}
-	/* Make a copy in here where it's safe */
-	if (ret) {
-		strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
-		ret = namebuf;
-	}
-	preempt_enable();
-
-	return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
-	struct module *mod;
-
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod)) {
-			const char *sym;
-
-			sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
-			if (!sym)
-				goto out;
-
-			strlcpy(symname, sym, KSYM_NAME_LEN);
-			preempt_enable();
-			return 0;
-		}
-	}
-out:
-	preempt_enable();
-	return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
-			unsigned long *offset, char *modname, char *name)
-{
-	struct module *mod;
-
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod)) {
-			const char *sym;
-
-			sym = find_kallsyms_symbol(mod, addr, size, offset);
-			if (!sym)
-				goto out;
-			if (modname)
-				strlcpy(modname, mod->name, MODULE_NAME_LEN);
-			if (name)
-				strlcpy(name, sym, KSYM_NAME_LEN);
-			preempt_enable();
-			return 0;
-		}
-	}
-out:
-	preempt_enable();
-	return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
-			char *name, char *module_name, int *exported)
-{
-	struct module *mod;
-
-	preempt_disable();
-	list_for_each_entry_rcu(mod, &modules, list) {
-		struct mod_kallsyms *kallsyms;
-
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		kallsyms = rcu_dereference_sched(mod->kallsyms);
-		if (symnum < kallsyms->num_symtab) {
-			const Elf_Sym *sym = &kallsyms->symtab[symnum];
-
-			*value = kallsyms_symbol_value(sym);
-			*type = kallsyms->typetab[symnum];
-			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
-			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
-			*exported = is_exported(name, *value, mod);
-			preempt_enable();
-			return 0;
-		}
-		symnum -= kallsyms->num_symtab;
-	}
-	preempt_enable();
-	return -ERANGE;
-}
-
-/* Given a module and name of symbol, find and return the symbol's value */
-static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
-{
-	unsigned int i;
-	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
-
-	for (i = 0; i < kallsyms->num_symtab; i++) {
-		const Elf_Sym *sym = &kallsyms->symtab[i];
-
-		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
-		    sym->st_shndx != SHN_UNDEF)
-			return kallsyms_symbol_value(sym);
-	}
-	return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
-	struct module *mod;
-	char *colon;
-	unsigned long ret = 0;
-
-	/* Don't lock: we're in enough trouble already. */
-	preempt_disable();
-	if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
-		if ((mod = find_module_all(name, colon - name, false)) != NULL)
-			ret = find_kallsyms_symbol_value(mod, colon+1);
-	} else {
-		list_for_each_entry_rcu(mod, &modules, list) {
-			if (mod->state == MODULE_STATE_UNFORMED)
-				continue;
-			if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
-				break;
-		}
-	}
-	preempt_enable();
-	return ret;
-}
-
-#ifdef CONFIG_LIVEPATCH
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
-					     struct module *, unsigned long),
-				   void *data)
-{
-	struct module *mod;
-	unsigned int i;
-	int ret = 0;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(mod, &modules, list) {
-		/* We hold module_mutex: no need for rcu_dereference_sched */
-		struct mod_kallsyms *kallsyms = mod->kallsyms;
-
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		for (i = 0; i < kallsyms->num_symtab; i++) {
-			const Elf_Sym *sym = &kallsyms->symtab[i];
-
-			if (sym->st_shndx == SHN_UNDEF)
-				continue;
-
-			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
-				 mod, kallsyms_symbol_value(sym));
-			if (ret != 0)
-				goto out;
-
-			cond_resched();
-		}
-	}
-out:
-	mutex_unlock(&module_mutex);
-	return ret;
-}
-#endif /* CONFIG_LIVEPATCH */
-#endif /* CONFIG_KALLSYMS */
-
 static void cfi_init(struct module *mod)
 {
 #ifdef CONFIG_CFI_CLANG
-- 
cgit v1.2.3


From 08126db5ff739fa011fc5b8af683ad759f2cba9a Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:40 +0000
Subject: module: kallsyms: Fix suspicious rcu usage

No functional change.

The purpose of this patch is to address the various Sparse warnings
due to the incorrect dereference/or access of an __rcu pointer.

Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/kallsyms.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 1b0780e20aab..a3da0686a2a6 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -171,14 +171,17 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
 
 	/* Set up to point into init section. */
-	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+	mod->kallsyms = (void __rcu *)mod->init_layout.base +
+		info->mod_kallsyms_init_off;
 
+	preempt_disable();
 	/* The following is safe since this pointer cannot change */
-	mod->kallsyms->symtab = (void *)symsec->sh_addr;
-	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+	rcu_dereference_sched(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
+	rcu_dereference_sched(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
 	/* Make sure we get permanent strtab: don't use info->strtab. */
-	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
-	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
+	rcu_dereference_sched(mod->kallsyms)->strtab =
+		(void *)info->sechdrs[info->index.str].sh_addr;
+	rcu_dereference_sched(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
 
 	/*
 	 * Now populate the cut down core kallsyms for after init
@@ -187,20 +190,22 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
 	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
 	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
-	src = mod->kallsyms->symtab;
-	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
-		mod->kallsyms->typetab[i] = elf_type(src + i, info);
+	src = rcu_dereference_sched(mod->kallsyms)->symtab;
+	for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) {
+		rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
 		if (i == 0 || is_livepatch_module(mod) ||
 		    is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			mod->core_kallsyms.typetab[ndst] =
-			    mod->kallsyms->typetab[i];
+			    rcu_dereference_sched(mod->kallsyms)->typetab[i];
 			dst[ndst] = src[i];
 			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
-			s += strscpy(s, &mod->kallsyms->strtab[src[i].st_name],
+			s += strscpy(s,
+				     &rcu_dereference_sched(mod->kallsyms)->strtab[src[i].st_name],
 				     KSYM_NAME_LEN) + 1;
 		}
 	}
+	preempt_enable();
 	mod->core_kallsyms.num_symtab = ndst;
 }
 
@@ -478,11 +483,16 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 
 	mutex_lock(&module_mutex);
 	list_for_each_entry(mod, &modules, list) {
-		/* We hold module_mutex: no need for rcu_dereference_sched */
-		struct mod_kallsyms *kallsyms = mod->kallsyms;
+		struct mod_kallsyms *kallsyms;
 
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
+
+		/* Use rcu_dereference_sched() to remain compliant with the sparse tool */
+		preempt_disable();
+		kallsyms = rcu_dereference_sched(mod->kallsyms);
+		preempt_enable();
+
 		for (i = 0; i < kallsyms->num_symtab; i++) {
 			const Elf_Sym *sym = &kallsyms->symtab[i];
 
-- 
cgit v1.2.3


From 0ffc40f6c8ab684e694774ebc835b198398129a8 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:41 +0000
Subject: module: Move procfs support into a separate file

No functional change.

This patch migrates code that allows one to generate a
list of loaded/or linked modules via /proc when procfs
support is enabled into kernel/module/procfs.c.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile   |   1 +
 kernel/module/internal.h |   1 +
 kernel/module/main.c     | 131 +------------------------------------------
 kernel/module/procfs.c   | 142 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 145 insertions(+), 130 deletions(-)
 create mode 100644 kernel/module/procfs.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 9901bed3ab5b..94296c98a67f 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -15,3 +15,4 @@ obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
 obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PROC_FS) += procfs.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 44ca05b9eb8f..6af40c2d145f 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -72,6 +72,7 @@ struct module *find_module_all(const char *name, size_t len, bool even_unformed)
 int cmp_name(const void *name, const void *sym);
 long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
 		       unsigned int section);
+char *module_flags(struct module *mod, char *buf);
 
 static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
 {
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 4d74988e1814..63b98452fd7d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -22,7 +22,6 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/elf.h>
-#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
@@ -805,31 +804,6 @@ out:
 	return ret;
 }
 
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
-	struct module_use *use;
-	int printed_something = 0;
-
-	seq_printf(m, " %i ", module_refcount(mod));
-
-	/*
-	 * Always include a trailing , so userspace can differentiate
-	 * between this and the old multi-field proc format.
-	 */
-	list_for_each_entry(use, &mod->source_list, source_list) {
-		printed_something = 1;
-		seq_printf(m, "%s,", use->source->name);
-	}
-
-	if (mod->init != NULL && mod->exit == NULL) {
-		printed_something = 1;
-		seq_puts(m, "[permanent],");
-	}
-
-	if (!printed_something)
-		seq_puts(m, "-");
-}
-
 void __symbol_put(const char *symbol)
 {
 	struct find_symbol_arg fsa = {
@@ -919,12 +893,6 @@ void module_put(struct module *module)
 EXPORT_SYMBOL(module_put);
 
 #else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
-	/* We don't know the usage count, or what modules are using. */
-	seq_puts(m, " - -");
-}
-
 static inline void module_unload_free(struct module *mod)
 {
 }
@@ -3596,7 +3564,7 @@ static void cfi_cleanup(struct module *mod)
 }
 
 /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
-static char *module_flags(struct module *mod, char *buf)
+char *module_flags(struct module *mod, char *buf)
 {
 	int bx = 0;
 
@@ -3619,103 +3587,6 @@ static char *module_flags(struct module *mod, char *buf)
 	return buf;
 }
 
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&module_mutex);
-	return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
-	mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
-	struct module *mod = list_entry(p, struct module, list);
-	char buf[MODULE_FLAGS_BUF_SIZE];
-	void *value;
-
-	/* We always ignore unformed modules. */
-	if (mod->state == MODULE_STATE_UNFORMED)
-		return 0;
-
-	seq_printf(m, "%s %u",
-		   mod->name, mod->init_layout.size + mod->core_layout.size);
-	print_unload_info(m, mod);
-
-	/* Informative for users. */
-	seq_printf(m, " %s",
-		   mod->state == MODULE_STATE_GOING ? "Unloading" :
-		   mod->state == MODULE_STATE_COMING ? "Loading" :
-		   "Live");
-	/* Used by oprofile and other similar tools. */
-	value = m->private ? NULL : mod->core_layout.base;
-	seq_printf(m, " 0x%px", value);
-
-	/* Taints info */
-	if (mod->taints)
-		seq_printf(m, " %s", module_flags(mod, buf));
-
-	seq_puts(m, "\n");
-	return 0;
-}
-
-/*
- * Format: modulename size refcount deps address
- *
- * Where refcount is a number or -, and deps is a comma-separated list
- * of depends or -.
- */
-static const struct seq_operations modules_op = {
-	.start	= m_start,
-	.next	= m_next,
-	.stop	= m_stop,
-	.show	= m_show
-};
-
-/*
- * This also sets the "private" pointer to non-NULL if the
- * kernel pointers should be hidden (so you can just test
- * "m->private" to see if you should keep the values private).
- *
- * We use the same logic as for /proc/kallsyms.
- */
-static int modules_open(struct inode *inode, struct file *file)
-{
-	int err = seq_open(file, &modules_op);
-
-	if (!err) {
-		struct seq_file *m = file->private_data;
-		m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
-	}
-
-	return err;
-}
-
-static const struct proc_ops modules_proc_ops = {
-	.proc_flags	= PROC_ENTRY_PERMANENT,
-	.proc_open	= modules_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
-	proc_create("modules", 0, NULL, &modules_proc_ops);
-	return 0;
-}
-module_init(proc_modules_init);
-#endif
-
 /* Given an address, look for it in the module exception tables. */
 const struct exception_table_entry *search_module_extables(unsigned long addr)
 {
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
new file mode 100644
index 000000000000..2717e130788e
--- /dev/null
+++ b/kernel/module/procfs.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module proc support
+ *
+ * Copyright (C) 2008 Alexey Dobriyan
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "internal.h"
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+	struct module_use *use;
+	int printed_something = 0;
+
+	seq_printf(m, " %i ", module_refcount(mod));
+
+	/*
+	 * Always include a trailing , so userspace can differentiate
+	 * between this and the old multi-field proc format.
+	 */
+	list_for_each_entry(use, &mod->source_list, source_list) {
+		printed_something = 1;
+		seq_printf(m, "%s,", use->source->name);
+	}
+
+	if (mod->init && !mod->exit) {
+		printed_something = 1;
+		seq_puts(m, "[permanent],");
+	}
+
+	if (!printed_something)
+		seq_puts(m, "-");
+}
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+	/* We don't know the usage count, or what modules are using. */
+	seq_puts(m, " - -");
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&module_mutex);
+	return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+	struct module *mod = list_entry(p, struct module, list);
+	char buf[MODULE_FLAGS_BUF_SIZE];
+	void *value;
+
+	/* We always ignore unformed modules. */
+	if (mod->state == MODULE_STATE_UNFORMED)
+		return 0;
+
+	seq_printf(m, "%s %u",
+		   mod->name, mod->init_layout.size + mod->core_layout.size);
+	print_unload_info(m, mod);
+
+	/* Informative for users. */
+	seq_printf(m, " %s",
+		   mod->state == MODULE_STATE_GOING ? "Unloading" :
+		   mod->state == MODULE_STATE_COMING ? "Loading" :
+		   "Live");
+	/* Used by oprofile and other similar tools. */
+	value = m->private ? NULL : mod->core_layout.base;
+	seq_printf(m, " 0x%px", value);
+
+	/* Taints info */
+	if (mod->taints)
+		seq_printf(m, " %s", module_flags(mod, buf));
+
+	seq_puts(m, "\n");
+	return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &modules_op);
+
+	if (!err) {
+		struct seq_file *m = file->private_data;
+
+		m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+	}
+
+	return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
+	.proc_open	= modules_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+	proc_create("modules", 0, NULL, &modules_proc_ops);
+	return 0;
+}
+module_init(proc_modules_init);
-- 
cgit v1.2.3


From 44c09535de4784f31d151aa1047efcf4797ca3cd Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:42 +0000
Subject: module: Move sysfs support into a separate file

No functional change.

This patch migrates module sysfs support out of core code into
kernel/module/sysfs.c. In addition simple code refactoring to
make this possible.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile   |   1 +
 kernel/module/internal.h |  21 +++
 kernel/module/main.c     | 469 +----------------------------------------------
 kernel/module/sysfs.c    | 436 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 461 insertions(+), 466 deletions(-)
 create mode 100644 kernel/module/sysfs.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 94296c98a67f..cf8dcdc6b55f 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PROC_FS) += procfs.o
+obj-$(CONFIG_SYSFS) += sysfs.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 6af40c2d145f..62d749ef695e 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -34,6 +34,9 @@
 extern struct mutex module_mutex;
 extern struct list_head modules;
 
+extern struct module_attribute *modinfo_attrs[];
+extern size_t modinfo_attrs_count;
+
 /* Provided by the linker */
 extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
@@ -204,3 +207,21 @@ static inline void init_build_id(struct module *mod, const struct load_info *inf
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
 #endif /* CONFIG_KALLSYMS */
+
+#ifdef CONFIG_SYSFS
+int mod_sysfs_setup(struct module *mod, const struct load_info *info,
+		    struct kernel_param *kparam, unsigned int num_params);
+void mod_sysfs_teardown(struct module *mod);
+void init_param_lock(struct module *mod);
+#else /* !CONFIG_SYSFS */
+static inline int mod_sysfs_setup(struct module *mod,
+			   	  const struct load_info *info,
+			   	  struct kernel_param *kparam,
+			   	  unsigned int num_params)
+{
+	return 0;
+}
+
+static inline void mod_sysfs_teardown(struct module *mod) { }
+static inline void init_param_lock(struct module *mod) { }
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 63b98452fd7d..0cd0590dd411 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -14,9 +14,7 @@
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/buildid.h>
-#include <linux/file.h>
 #include <linux/fs.h>
-#include <linux/sysfs.h>
 #include <linux/kernel.h>
 #include <linux/kernel_read_file.h>
 #include <linux/slab.h>
@@ -989,7 +987,7 @@ static ssize_t show_taint(struct module_attribute *mattr,
 static struct module_attribute modinfo_taint =
 	__ATTR(taint, 0444, show_taint, NULL);
 
-static struct module_attribute *modinfo_attrs[] = {
+struct module_attribute *modinfo_attrs[] = {
 	&module_uevent,
 	&modinfo_version,
 	&modinfo_srcversion,
@@ -1003,6 +1001,8 @@ static struct module_attribute *modinfo_attrs[] = {
 	NULL,
 };
 
+size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+
 static const char vermagic[] = VERMAGIC_STRING;
 
 static int try_to_force_load(struct module *mod, const char *reason)
@@ -1253,469 +1253,6 @@ resolve_symbol_wait(struct module *mod,
 	return ksym;
 }
 
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-struct module_sect_attr {
-	struct bin_attribute battr;
-	unsigned long address;
-};
-
-struct module_sect_attrs {
-	struct attribute_group grp;
-	unsigned int nsections;
-	struct module_sect_attr attrs[];
-};
-
-#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
-static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
-				struct bin_attribute *battr,
-				char *buf, loff_t pos, size_t count)
-{
-	struct module_sect_attr *sattr =
-		container_of(battr, struct module_sect_attr, battr);
-	char bounce[MODULE_SECT_READ_SIZE + 1];
-	size_t wrote;
-
-	if (pos != 0)
-		return -EINVAL;
-
-	/*
-	 * Since we're a binary read handler, we must account for the
-	 * trailing NUL byte that sprintf will write: if "buf" is
-	 * too small to hold the NUL, or the NUL is exactly the last
-	 * byte, the read will look like it got truncated by one byte.
-	 * Since there is no way to ask sprintf nicely to not write
-	 * the NUL, we have to use a bounce buffer.
-	 */
-	wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
-			 kallsyms_show_value(file->f_cred)
-				? (void *)sattr->address : NULL);
-	count = min(count, wrote);
-	memcpy(buf, bounce, count);
-
-	return count;
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
-	unsigned int section;
-
-	for (section = 0; section < sect_attrs->nsections; section++)
-		kfree(sect_attrs->attrs[section].battr.attr.name);
-	kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
-	unsigned int nloaded = 0, i, size[2];
-	struct module_sect_attrs *sect_attrs;
-	struct module_sect_attr *sattr;
-	struct bin_attribute **gattr;
-
-	/* Count loaded sections and allocate structures */
-	for (i = 0; i < info->hdr->e_shnum; i++)
-		if (!sect_empty(&info->sechdrs[i]))
-			nloaded++;
-	size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
-			sizeof(sect_attrs->grp.bin_attrs[0]));
-	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
-	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
-	if (sect_attrs == NULL)
-		return;
-
-	/* Setup section attributes. */
-	sect_attrs->grp.name = "sections";
-	sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
-
-	sect_attrs->nsections = 0;
-	sattr = &sect_attrs->attrs[0];
-	gattr = &sect_attrs->grp.bin_attrs[0];
-	for (i = 0; i < info->hdr->e_shnum; i++) {
-		Elf_Shdr *sec = &info->sechdrs[i];
-		if (sect_empty(sec))
-			continue;
-		sysfs_bin_attr_init(&sattr->battr);
-		sattr->address = sec->sh_addr;
-		sattr->battr.attr.name =
-			kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
-		if (sattr->battr.attr.name == NULL)
-			goto out;
-		sect_attrs->nsections++;
-		sattr->battr.read = module_sect_read;
-		sattr->battr.size = MODULE_SECT_READ_SIZE;
-		sattr->battr.attr.mode = 0400;
-		*(gattr++) = &(sattr++)->battr;
-	}
-	*gattr = NULL;
-
-	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
-		goto out;
-
-	mod->sect_attrs = sect_attrs;
-	return;
-  out:
-	free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
-	if (mod->sect_attrs) {
-		sysfs_remove_group(&mod->mkobj.kobj,
-				   &mod->sect_attrs->grp);
-		/*
-		 * We are positive that no one is using any sect attrs
-		 * at this point.  Deallocate immediately.
-		 */
-		free_sect_attrs(mod->sect_attrs);
-		mod->sect_attrs = NULL;
-	}
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
-	struct kobject *dir;
-	unsigned int notes;
-	struct bin_attribute attrs[];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
-				 struct bin_attribute *bin_attr,
-				 char *buf, loff_t pos, size_t count)
-{
-	/*
-	 * The caller checked the pos and count against our size.
-	 */
-	memcpy(buf, bin_attr->private + pos, count);
-	return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
-			     unsigned int i)
-{
-	if (notes_attrs->dir) {
-		while (i-- > 0)
-			sysfs_remove_bin_file(notes_attrs->dir,
-					      &notes_attrs->attrs[i]);
-		kobject_put(notes_attrs->dir);
-	}
-	kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
-	unsigned int notes, loaded, i;
-	struct module_notes_attrs *notes_attrs;
-	struct bin_attribute *nattr;
-
-	/* failed to create section attributes, so can't create notes */
-	if (!mod->sect_attrs)
-		return;
-
-	/* Count notes sections and allocate structures.  */
-	notes = 0;
-	for (i = 0; i < info->hdr->e_shnum; i++)
-		if (!sect_empty(&info->sechdrs[i]) &&
-		    (info->sechdrs[i].sh_type == SHT_NOTE))
-			++notes;
-
-	if (notes == 0)
-		return;
-
-	notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
-			      GFP_KERNEL);
-	if (notes_attrs == NULL)
-		return;
-
-	notes_attrs->notes = notes;
-	nattr = &notes_attrs->attrs[0];
-	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
-		if (sect_empty(&info->sechdrs[i]))
-			continue;
-		if (info->sechdrs[i].sh_type == SHT_NOTE) {
-			sysfs_bin_attr_init(nattr);
-			nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
-			nattr->attr.mode = S_IRUGO;
-			nattr->size = info->sechdrs[i].sh_size;
-			nattr->private = (void *) info->sechdrs[i].sh_addr;
-			nattr->read = module_notes_read;
-			++nattr;
-		}
-		++loaded;
-	}
-
-	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
-	if (!notes_attrs->dir)
-		goto out;
-
-	for (i = 0; i < notes; ++i)
-		if (sysfs_create_bin_file(notes_attrs->dir,
-					  &notes_attrs->attrs[i]))
-			goto out;
-
-	mod->notes_attrs = notes_attrs;
-	return;
-
-  out:
-	free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
-	if (mod->notes_attrs)
-		free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
-				  const struct load_info *info)
-{
-}
-
-static inline void remove_sect_attrs(struct module *mod)
-{
-}
-
-static inline void add_notes_attrs(struct module *mod,
-				   const struct load_info *info)
-{
-}
-
-static inline void remove_notes_attrs(struct module *mod)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void del_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
-	struct module_use *use;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(use, &mod->target_list, target_list)
-		sysfs_remove_link(use->target->holders_dir, mod->name);
-	mutex_unlock(&module_mutex);
-#endif
-}
-
-static int add_usage_links(struct module *mod)
-{
-	int ret = 0;
-#ifdef CONFIG_MODULE_UNLOAD
-	struct module_use *use;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(use, &mod->target_list, target_list) {
-		ret = sysfs_create_link(use->target->holders_dir,
-					&mod->mkobj.kobj, mod->name);
-		if (ret)
-			break;
-	}
-	mutex_unlock(&module_mutex);
-	if (ret)
-		del_usage_links(mod);
-#endif
-	return ret;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end);
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
-	struct module_attribute *attr;
-	struct module_attribute *temp_attr;
-	int error = 0;
-	int i;
-
-	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
-					(ARRAY_SIZE(modinfo_attrs) + 1)),
-					GFP_KERNEL);
-	if (!mod->modinfo_attrs)
-		return -ENOMEM;
-
-	temp_attr = mod->modinfo_attrs;
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
-		if (!attr->test || attr->test(mod)) {
-			memcpy(temp_attr, attr, sizeof(*temp_attr));
-			sysfs_attr_init(&temp_attr->attr);
-			error = sysfs_create_file(&mod->mkobj.kobj,
-					&temp_attr->attr);
-			if (error)
-				goto error_out;
-			++temp_attr;
-		}
-	}
-
-	return 0;
-
-error_out:
-	if (i > 0)
-		module_remove_modinfo_attrs(mod, --i);
-	else
-		kfree(mod->modinfo_attrs);
-	return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-	struct module_attribute *attr;
-	int i;
-
-	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
-		if (end >= 0 && i > end)
-			break;
-		/* pick a field to test for end of list */
-		if (!attr->attr.name)
-			break;
-		sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
-		if (attr->free)
-			attr->free(mod);
-	}
-	kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
-	DECLARE_COMPLETION_ONSTACK(c);
-	mod->mkobj.kobj_completion = &c;
-	kobject_put(&mod->mkobj.kobj);
-	wait_for_completion(&c);
-}
-
-static int mod_sysfs_init(struct module *mod)
-{
-	int err;
-	struct kobject *kobj;
-
-	if (!module_sysfs_initialized) {
-		pr_err("%s: module sysfs not initialized\n", mod->name);
-		err = -EINVAL;
-		goto out;
-	}
-
-	kobj = kset_find_obj(module_kset, mod->name);
-	if (kobj) {
-		pr_err("%s: module is already loaded\n", mod->name);
-		kobject_put(kobj);
-		err = -EINVAL;
-		goto out;
-	}
-
-	mod->mkobj.mod = mod;
-
-	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
-	mod->mkobj.kobj.kset = module_kset;
-	err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
-				   "%s", mod->name);
-	if (err)
-		mod_kobject_put(mod);
-
-out:
-	return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
-			   const struct load_info *info,
-			   struct kernel_param *kparam,
-			   unsigned int num_params)
-{
-	int err;
-
-	err = mod_sysfs_init(mod);
-	if (err)
-		goto out;
-
-	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
-	if (!mod->holders_dir) {
-		err = -ENOMEM;
-		goto out_unreg;
-	}
-
-	err = module_param_sysfs_setup(mod, kparam, num_params);
-	if (err)
-		goto out_unreg_holders;
-
-	err = module_add_modinfo_attrs(mod);
-	if (err)
-		goto out_unreg_param;
-
-	err = add_usage_links(mod);
-	if (err)
-		goto out_unreg_modinfo_attrs;
-
-	add_sect_attrs(mod, info);
-	add_notes_attrs(mod, info);
-
-	return 0;
-
-out_unreg_modinfo_attrs:
-	module_remove_modinfo_attrs(mod, -1);
-out_unreg_param:
-	module_param_sysfs_remove(mod);
-out_unreg_holders:
-	kobject_put(mod->holders_dir);
-out_unreg:
-	mod_kobject_put(mod);
-out:
-	return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-	remove_notes_attrs(mod);
-	remove_sect_attrs(mod);
-	mod_kobject_put(mod);
-}
-
-static void init_param_lock(struct module *mod)
-{
-	mutex_init(&mod->param_lock);
-}
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
-			   const struct load_info *info,
-			   struct kernel_param *kparam,
-			   unsigned int num_params)
-{
-	return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod, int end)
-{
-}
-
-static void del_usage_links(struct module *mod)
-{
-}
-
-static void init_param_lock(struct module *mod)
-{
-}
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
-	del_usage_links(mod);
-	module_remove_modinfo_attrs(mod, -1);
-	module_param_sysfs_remove(mod);
-	kobject_put(mod->mkobj.drivers_dir);
-	kobject_put(mod->holders_dir);
-	mod_sysfs_fini(mod);
-}
-
 /*
  * LKM RO/NX protection: protect module's text/ro-data
  * from modification and any data from execution.
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
new file mode 100644
index 000000000000..ce68f821dcd1
--- /dev/null
+++ b/kernel/module/sysfs.c
@@ -0,0 +1,436 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module sysfs support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include "internal.h"
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attr {
+	struct bin_attribute battr;
+	unsigned long address;
+};
+
+struct module_sect_attrs {
+	struct attribute_group grp;
+	unsigned int nsections;
+	struct module_sect_attr attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+				struct bin_attribute *battr,
+				char *buf, loff_t pos, size_t count)
+{
+	struct module_sect_attr *sattr =
+		container_of(battr, struct module_sect_attr, battr);
+	char bounce[MODULE_SECT_READ_SIZE + 1];
+	size_t wrote;
+
+	if (pos != 0)
+		return -EINVAL;
+
+	/*
+	 * Since we're a binary read handler, we must account for the
+	 * trailing NUL byte that sprintf will write: if "buf" is
+	 * too small to hold the NUL, or the NUL is exactly the last
+	 * byte, the read will look like it got truncated by one byte.
+	 * Since there is no way to ask sprintf nicely to not write
+	 * the NUL, we have to use a bounce buffer.
+	 */
+	wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+			  kallsyms_show_value(file->f_cred)
+				? (void *)sattr->address : NULL);
+	count = min(count, wrote);
+	memcpy(buf, bounce, count);
+
+	return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	unsigned int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].battr.attr.name);
+	kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+	unsigned int nloaded = 0, i, size[2];
+	struct module_sect_attrs *sect_attrs;
+	struct module_sect_attr *sattr;
+	struct bin_attribute **gattr;
+
+	/* Count loaded sections and allocate structures */
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]))
+			nloaded++;
+	size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
+			sizeof(sect_attrs->grp.bin_attrs[0]));
+	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (!sect_attrs)
+		return;
+
+	/* Setup section attributes. */
+	sect_attrs->grp.name = "sections";
+	sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+
+	sect_attrs->nsections = 0;
+	sattr = &sect_attrs->attrs[0];
+	gattr = &sect_attrs->grp.bin_attrs[0];
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *sec = &info->sechdrs[i];
+
+		if (sect_empty(sec))
+			continue;
+		sysfs_bin_attr_init(&sattr->battr);
+		sattr->address = sec->sh_addr;
+		sattr->battr.attr.name =
+			kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+		if (!sattr->battr.attr.name)
+			goto out;
+		sect_attrs->nsections++;
+		sattr->battr.read = module_sect_read;
+		sattr->battr.size = MODULE_SECT_READ_SIZE;
+		sattr->battr.attr.mode = 0400;
+		*(gattr++) = &(sattr++)->battr;
+	}
+	*gattr = NULL;
+
+	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+		goto out;
+
+	mod->sect_attrs = sect_attrs;
+	return;
+out:
+	free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+	if (mod->sect_attrs) {
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->sect_attrs->grp);
+		/*
+		 * We are positive that no one is using any sect attrs
+		 * at this point.  Deallocate immediately.
+		 */
+		free_sect_attrs(mod->sect_attrs);
+		mod->sect_attrs = NULL;
+	}
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+	struct kobject *dir;
+	unsigned int notes;
+	struct bin_attribute attrs[];
+};
+
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t pos, size_t count)
+{
+	/*
+	 * The caller checked the pos and count against our size.
+	 */
+	memcpy(buf, bin_attr->private + pos, count);
+	return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+			     unsigned int i)
+{
+	if (notes_attrs->dir) {
+		while (i-- > 0)
+			sysfs_remove_bin_file(notes_attrs->dir,
+					      &notes_attrs->attrs[i]);
+		kobject_put(notes_attrs->dir);
+	}
+	kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+	unsigned int notes, loaded, i;
+	struct module_notes_attrs *notes_attrs;
+	struct bin_attribute *nattr;
+
+	/* failed to create section attributes, so can't create notes */
+	if (!mod->sect_attrs)
+		return;
+
+	/* Count notes sections and allocate structures.  */
+	notes = 0;
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]) &&
+		    info->sechdrs[i].sh_type == SHT_NOTE)
+			++notes;
+
+	if (notes == 0)
+		return;
+
+	notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+			      GFP_KERNEL);
+	if (!notes_attrs)
+		return;
+
+	notes_attrs->notes = notes;
+	nattr = &notes_attrs->attrs[0];
+	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+		if (sect_empty(&info->sechdrs[i]))
+			continue;
+		if (info->sechdrs[i].sh_type == SHT_NOTE) {
+			sysfs_bin_attr_init(nattr);
+			nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
+			nattr->attr.mode = 0444;
+			nattr->size = info->sechdrs[i].sh_size;
+			nattr->private = (void *)info->sechdrs[i].sh_addr;
+			nattr->read = module_notes_read;
+			++nattr;
+		}
+		++loaded;
+	}
+
+	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+	if (!notes_attrs->dir)
+		goto out;
+
+	for (i = 0; i < notes; ++i)
+		if (sysfs_create_bin_file(notes_attrs->dir,
+					  &notes_attrs->attrs[i]))
+			goto out;
+
+	mod->notes_attrs = notes_attrs;
+	return;
+
+out:
+	free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+	if (mod->notes_attrs)
+		free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else /* !CONFIG_KALLSYMS */
+static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_sect_attrs(struct module *mod) { }
+static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { }
+static inline void remove_notes_attrs(struct module *mod) { }
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+	struct module_use *use;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(use, &mod->target_list, target_list)
+		sysfs_remove_link(use->target->holders_dir, mod->name);
+	mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+	int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+	struct module_use *use;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(use, &mod->target_list, target_list) {
+		ret = sysfs_create_link(use->target->holders_dir,
+					&mod->mkobj.kobj, mod->name);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&module_mutex);
+	if (ret)
+		del_usage_links(mod);
+#endif
+	return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+		if (end >= 0 && i > end)
+			break;
+		/* pick a field to test for end of list */
+		if (!attr->attr.name)
+			break;
+		sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+		if (attr->free)
+			attr->free(mod);
+	}
+	kfree(mod->modinfo_attrs);
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	struct module_attribute *temp_attr;
+	int error = 0;
+	int i;
+
+	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+					(modinfo_attrs_count + 1)),
+					GFP_KERNEL);
+	if (!mod->modinfo_attrs)
+		return -ENOMEM;
+
+	temp_attr = mod->modinfo_attrs;
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (!attr->test || attr->test(mod)) {
+			memcpy(temp_attr, attr, sizeof(*temp_attr));
+			sysfs_attr_init(&temp_attr->attr);
+			error = sysfs_create_file(&mod->mkobj.kobj,
+						  &temp_attr->attr);
+			if (error)
+				goto error_out;
+			++temp_attr;
+		}
+	}
+
+	return 0;
+
+error_out:
+	if (i > 0)
+		module_remove_modinfo_attrs(mod, --i);
+	else
+		kfree(mod->modinfo_attrs);
+	return error;
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+	DECLARE_COMPLETION_ONSTACK(c);
+
+	mod->mkobj.kobj_completion = &c;
+	kobject_put(&mod->mkobj.kobj);
+	wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+	int err;
+	struct kobject *kobj;
+
+	if (!module_sysfs_initialized) {
+		pr_err("%s: module sysfs not initialized\n", mod->name);
+		err = -EINVAL;
+		goto out;
+	}
+
+	kobj = kset_find_obj(module_kset, mod->name);
+	if (kobj) {
+		pr_err("%s: module is already loaded\n", mod->name);
+		kobject_put(kobj);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mod->mkobj.mod = mod;
+
+	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+	mod->mkobj.kobj.kset = module_kset;
+	err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+				   "%s", mod->name);
+	if (err)
+		mod_kobject_put(mod);
+
+out:
+	return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+		    const struct load_info *info,
+			   struct kernel_param *kparam,
+			   unsigned int num_params)
+{
+	int err;
+
+	err = mod_sysfs_init(mod);
+	if (err)
+		goto out;
+
+	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+	if (!mod->holders_dir) {
+		err = -ENOMEM;
+		goto out_unreg;
+	}
+
+	err = module_param_sysfs_setup(mod, kparam, num_params);
+	if (err)
+		goto out_unreg_holders;
+
+	err = module_add_modinfo_attrs(mod);
+	if (err)
+		goto out_unreg_param;
+
+	err = add_usage_links(mod);
+	if (err)
+		goto out_unreg_modinfo_attrs;
+
+	add_sect_attrs(mod, info);
+	add_notes_attrs(mod, info);
+
+	return 0;
+
+out_unreg_modinfo_attrs:
+	module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+	module_param_sysfs_remove(mod);
+out_unreg_holders:
+	kobject_put(mod->holders_dir);
+out_unreg:
+	mod_kobject_put(mod);
+out:
+	return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+	remove_notes_attrs(mod);
+	remove_sect_attrs(mod);
+	mod_kobject_put(mod);
+}
+
+void mod_sysfs_teardown(struct module *mod)
+{
+	del_usage_links(mod);
+	module_remove_modinfo_attrs(mod, -1);
+	module_param_sysfs_remove(mod);
+	kobject_put(mod->mkobj.drivers_dir);
+	kobject_put(mod->holders_dir);
+	mod_sysfs_fini(mod);
+}
+
+void init_param_lock(struct module *mod)
+{
+	mutex_init(&mod->param_lock);
+}
-- 
cgit v1.2.3


From f64205a42046d3802c423fa2059e7fca39af127c Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:43 +0000
Subject: module: Move kdb module related code out of main kdb code

No functional change.

This patch migrates the kdb 'lsmod' command support out of main
kdb code into its own file under kernel/module. In addition to
the above, a minor style warning i.e. missing a blank line after
declarations, was resolved too. The new file was added to
MAINTAINERS. Finally we remove linux/module.h as it is entirely
redundant.

Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/debug/kdb/kdb_io.c       |  1 -
 kernel/debug/kdb/kdb_keyboard.c |  1 -
 kernel/debug/kdb/kdb_main.c     | 49 ------------------------------------
 kernel/debug/kdb/kdb_private.h  |  4 ---
 kernel/debug/kdb/kdb_support.c  |  1 -
 kernel/module/Makefile          |  1 +
 kernel/module/kdb.c             | 56 +++++++++++++++++++++++++++++++++++++++++
 kernel/module/main.c            |  4 ---
 8 files changed, 57 insertions(+), 60 deletions(-)
 create mode 100644 kernel/module/kdb.c

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6735ac36b718..67d3c48a1522 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -9,7 +9,6 @@
  * Copyright (c) 2009 Wind River Systems, Inc.  All Rights Reserved.
  */
 
-#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
 #include <linux/kernel.h>
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index f877a0a0d7cf..f87c750d3eb3 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -11,7 +11,6 @@
 #include <linux/kdb.h>
 #include <linux/keyboard.h>
 #include <linux/ctype.h>
-#include <linux/module.h>
 #include <linux/io.h>
 
 /* Keyboard Controller Registers on normal PCs. */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0852a537dad4..f3a30cd5037f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -26,7 +26,6 @@
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
 #include <linux/atomic.h>
-#include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/mm.h>
 #include <linux/init.h>
@@ -2004,54 +2003,6 @@ static int kdb_ef(int argc, const char **argv)
 	return 0;
 }
 
-#if defined(CONFIG_MODULES)
-/*
- * kdb_lsmod - This function implements the 'lsmod' command.  Lists
- *	currently loaded kernel modules.
- *	Mostly taken from userland lsmod.
- */
-static int kdb_lsmod(int argc, const char **argv)
-{
-	struct module *mod;
-
-	if (argc != 0)
-		return KDB_ARGCOUNT;
-
-	kdb_printf("Module                  Size  modstruct     Used by\n");
-	list_for_each_entry(mod, kdb_modules, list) {
-		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-
-		kdb_printf("%-20s%8u  0x%px ", mod->name,
-			   mod->core_layout.size, (void *)mod);
-#ifdef CONFIG_MODULE_UNLOAD
-		kdb_printf("%4d ", module_refcount(mod));
-#endif
-		if (mod->state == MODULE_STATE_GOING)
-			kdb_printf(" (Unloading)");
-		else if (mod->state == MODULE_STATE_COMING)
-			kdb_printf(" (Loading)");
-		else
-			kdb_printf(" (Live)");
-		kdb_printf(" 0x%px", mod->core_layout.base);
-
-#ifdef CONFIG_MODULE_UNLOAD
-		{
-			struct module_use *use;
-			kdb_printf(" [ ");
-			list_for_each_entry(use, &mod->source_list,
-					    source_list)
-				kdb_printf("%s ", use->target->name);
-			kdb_printf("]\n");
-		}
-#endif
-	}
-
-	return 0;
-}
-
-#endif	/* CONFIG_MODULES */
-
 /*
  * kdb_env - This function implements the 'env' command.  Display the
  *	current environment variables.
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 0d2f9feea0a4..1f8c519a5f81 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -226,10 +226,6 @@ extern void kdb_kbd_cleanup_state(void);
 #define kdb_kbd_cleanup_state()
 #endif /* ! CONFIG_KDB_KEYBOARD */
 
-#ifdef CONFIG_MODULES
-extern struct list_head *kdb_modules;
-#endif /* CONFIG_MODULES */
-
 extern char kdb_prompt_str[];
 
 #define	KDB_WORD_SIZE	((int)sizeof(unsigned long))
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 85cb51c4a17e..0a39497140bf 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -17,7 +17,6 @@
 #include <linux/stddef.h>
 #include <linux/vmalloc.h>
 #include <linux/ptrace.h>
-#include <linux/module.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
 #include <linux/delay.h>
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index cf8dcdc6b55f..88f5cdcdb067 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PROC_FS) += procfs.o
 obj-$(CONFIG_SYSFS) += sysfs.o
+obj-$(CONFIG_KGDB_KDB) += kdb.o
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
new file mode 100644
index 000000000000..a446c699db0a
--- /dev/null
+++ b/kernel/module/kdb.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kdb support
+ *
+ * Copyright (C) 2010 Jason Wessel
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+#include "internal.h"
+
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command.  Lists
+ *	currently loaded kernel modules.
+ *	Mostly taken from userland lsmod.
+ */
+int kdb_lsmod(int argc, const char **argv)
+{
+	struct module *mod;
+
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+
+	kdb_printf("Module                  Size  modstruct     Used by\n");
+	list_for_each_entry(mod, &modules, list) {
+		if (mod->state == MODULE_STATE_UNFORMED)
+			continue;
+
+		kdb_printf("%-20s%8u  0x%px ", mod->name,
+			   mod->core_layout.size, (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+		kdb_printf("%4d ", module_refcount(mod));
+#endif
+		if (mod->state == MODULE_STATE_GOING)
+			kdb_printf(" (Unloading)");
+		else if (mod->state == MODULE_STATE_COMING)
+			kdb_printf(" (Loading)");
+		else
+			kdb_printf(" (Live)");
+		kdb_printf(" 0x%px", mod->core_layout.base);
+
+#ifdef CONFIG_MODULE_UNLOAD
+		{
+			struct module_use *use;
+
+			kdb_printf(" [ ");
+			list_for_each_entry(use, &mod->source_list,
+					    source_list)
+				kdb_printf("%s ", use->target->name);
+			kdb_printf("]\n");
+		}
+#endif
+	}
+
+	return 0;
+}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 0cd0590dd411..a2dc54726621 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -108,10 +108,6 @@ static void mod_update_bounds(struct module *mod)
 		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
 }
 
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
 static void module_assert_mutex_or_preempt(void)
 {
 #ifdef CONFIG_LOCKDEP
-- 
cgit v1.2.3


From 47889798da4307ed78346f04c5d95c87abbf696b Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Tue, 22 Mar 2022 14:03:44 +0000
Subject: module: Move version support into a separate file

No functional change.

This patch migrates module version support out of core code into
kernel/module/version.c. In addition simple code refactoring to
make this possible.

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile   |   1 +
 kernel/module/internal.h |  48 +++++++++++++++
 kernel/module/main.c     | 156 +++--------------------------------------------
 kernel/module/version.c  | 109 +++++++++++++++++++++++++++++++++
 4 files changed, 166 insertions(+), 148 deletions(-)
 create mode 100644 kernel/module/version.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 88f5cdcdb067..e2eff9853a28 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -18,3 +18,4 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PROC_FS) += procfs.o
 obj-$(CONFIG_SYSFS) += sysfs.o
 obj-$(CONFIG_KGDB_KDB) += kdb.o
+obj-$(CONFIG_MODVERSIONS) += version.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 62d749ef695e..3fc139d5074b 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -70,7 +70,27 @@ struct load_info {
 	} index;
 };
 
+enum mod_license {
+	NOT_GPL_ONLY,
+	GPL_ONLY,
+};
+
+struct find_symbol_arg {
+	/* Input */
+	const char *name;
+	bool gplok;
+	bool warn;
+
+	/* Output */
+	struct module *owner;
+	const s32 *crc;
+	const struct kernel_symbol *sym;
+	enum mod_license license;
+};
+
 int mod_verify_sig(const void *mod, struct load_info *info);
+int try_to_force_load(struct module *mod, const char *reason);
+bool find_symbol(struct find_symbol_arg *fsa);
 struct module *find_module_all(const char *name, size_t len, bool even_unformed);
 int cmp_name(const void *name, const void *sym);
 long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
@@ -225,3 +245,31 @@ static inline int mod_sysfs_setup(struct module *mod,
 static inline void mod_sysfs_teardown(struct module *mod) { }
 static inline void init_param_lock(struct module *mod) { }
 #endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODVERSIONS
+int check_version(const struct load_info *info,
+		  const char *symname, struct module *mod, const s32 *crc);
+void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
+		   struct kernel_symbol *ks, struct tracepoint * const *tp);
+int check_modstruct_version(const struct load_info *info, struct module *mod);
+int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+#else /* !CONFIG_MODVERSIONS */
+static inline int check_version(const struct load_info *info,
+				const char *symname,
+				struct module *mod,
+				const s32 *crc)
+{
+	return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+					  struct module *mod)
+{
+	return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
+{
+	return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a2dc54726621..5f06c06df9e1 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -86,6 +86,12 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
 static unsigned long module_addr_min = -1UL, module_addr_max;
 #endif /* CONFIG_MODULES_TREE_LOOKUP */
 
+struct symsearch {
+	const struct kernel_symbol *start, *stop;
+	const s32 *crcs;
+	enum mod_license license;
+};
+
 /*
  * Bounds of module text, for speeding up __module_address.
  * Protected by module_mutex.
@@ -244,28 +250,6 @@ static __maybe_unused void *any_section_objs(const struct load_info *info,
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
-struct symsearch {
-	const struct kernel_symbol *start, *stop;
-	const s32 *crcs;
-	enum mod_license {
-		NOT_GPL_ONLY,
-		GPL_ONLY,
-	} license;
-};
-
-struct find_symbol_arg {
-	/* Input */
-	const char *name;
-	bool gplok;
-	bool warn;
-
-	/* Output */
-	struct module *owner;
-	const s32 *crc;
-	const struct kernel_symbol *sym;
-	enum mod_license license;
-};
-
 static bool check_exported_symbol(const struct symsearch *syms,
 				  struct module *owner,
 				  unsigned int symnum, void *data)
@@ -327,7 +311,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
  * Find an exported symbol and return it, along with, (optional) crc and
  * (optional) module which owns it.  Needs preempt disabled or module_mutex.
  */
-static bool find_symbol(struct find_symbol_arg *fsa)
+bool find_symbol(struct find_symbol_arg *fsa)
 {
 	static const struct symsearch arr[] = {
 		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
@@ -1001,7 +985,7 @@ size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
 
 static const char vermagic[] = VERMAGIC_STRING;
 
-static int try_to_force_load(struct module *mod, const char *reason)
+int try_to_force_load(struct module *mod, const char *reason)
 {
 #ifdef CONFIG_MODULE_FORCE_LOAD
 	if (!test_taint(TAINT_FORCED_MODULE))
@@ -1013,115 +997,6 @@ static int try_to_force_load(struct module *mod, const char *reason)
 #endif
 }
 
-#ifdef CONFIG_MODVERSIONS
-
-static u32 resolve_rel_crc(const s32 *crc)
-{
-	return *(u32 *)((void *)crc + *crc);
-}
-
-static int check_version(const struct load_info *info,
-			 const char *symname,
-			 struct module *mod,
-			 const s32 *crc)
-{
-	Elf_Shdr *sechdrs = info->sechdrs;
-	unsigned int versindex = info->index.vers;
-	unsigned int i, num_versions;
-	struct modversion_info *versions;
-
-	/* Exporting module didn't supply crcs?  OK, we're already tainted. */
-	if (!crc)
-		return 1;
-
-	/* No versions at all?  modprobe --force does this. */
-	if (versindex == 0)
-		return try_to_force_load(mod, symname) == 0;
-
-	versions = (void *) sechdrs[versindex].sh_addr;
-	num_versions = sechdrs[versindex].sh_size
-		/ sizeof(struct modversion_info);
-
-	for (i = 0; i < num_versions; i++) {
-		u32 crcval;
-
-		if (strcmp(versions[i].name, symname) != 0)
-			continue;
-
-		if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
-			crcval = resolve_rel_crc(crc);
-		else
-			crcval = *crc;
-		if (versions[i].crc == crcval)
-			return 1;
-		pr_debug("Found checksum %X vs module %lX\n",
-			 crcval, versions[i].crc);
-		goto bad_version;
-	}
-
-	/* Broken toolchain. Warn once, then let it go.. */
-	pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
-	return 1;
-
-bad_version:
-	pr_warn("%s: disagrees about version of symbol %s\n",
-	       info->name, symname);
-	return 0;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
-					  struct module *mod)
-{
-	struct find_symbol_arg fsa = {
-		.name	= "module_layout",
-		.gplok	= true,
-	};
-
-	/*
-	 * Since this should be found in kernel (which can't be removed), no
-	 * locking is necessary -- use preempt_disable() to placate lockdep.
-	 */
-	preempt_disable();
-	if (!find_symbol(&fsa)) {
-		preempt_enable();
-		BUG();
-	}
-	preempt_enable();
-	return check_version(info, "module_layout", mod, fsa.crc);
-}
-
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
-			     bool has_crcs)
-{
-	if (has_crcs) {
-		amagic += strcspn(amagic, " ");
-		bmagic += strcspn(bmagic, " ");
-	}
-	return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(const struct load_info *info,
-				const char *symname,
-				struct module *mod,
-				const s32 *crc)
-{
-	return 1;
-}
-
-static inline int check_modstruct_version(const struct load_info *info,
-					  struct module *mod)
-{
-	return 1;
-}
-
-static inline int same_magic(const char *amagic, const char *bmagic,
-			     bool has_crcs)
-{
-	return strcmp(amagic, bmagic) == 0;
-}
-#endif /* CONFIG_MODVERSIONS */
-
 static char *get_modinfo(const struct load_info *info, const char *tag);
 static char *get_next_modinfo(const struct load_info *info, const char *tag,
 			      char *prev);
@@ -3247,18 +3122,3 @@ void print_modules(void)
 		pr_cont(" [last unloaded: %s]", last_unloaded_module);
 	pr_cont("\n");
 }
-
-#ifdef CONFIG_MODVERSIONS
-/*
- * Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module.
- */
-void module_layout(struct module *mod,
-		   struct modversion_info *ver,
-		   struct kernel_param *kp,
-		   struct kernel_symbol *ks,
-		   struct tracepoint * const *tp)
-{
-}
-EXPORT_SYMBOL(module_layout);
-#endif
diff --git a/kernel/module/version.c b/kernel/module/version.c
new file mode 100644
index 000000000000..adaedce1dc97
--- /dev/null
+++ b/kernel/module/version.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module version support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include "internal.h"
+
+static u32 resolve_rel_crc(const s32 *crc)
+{
+	return *(u32 *)((void *)crc + *crc);
+}
+
+int check_version(const struct load_info *info,
+		  const char *symname,
+			 struct module *mod,
+			 const s32 *crc)
+{
+	Elf_Shdr *sechdrs = info->sechdrs;
+	unsigned int versindex = info->index.vers;
+	unsigned int i, num_versions;
+	struct modversion_info *versions;
+
+	/* Exporting module didn't supply crcs?  OK, we're already tainted. */
+	if (!crc)
+		return 1;
+
+	/* No versions at all?  modprobe --force does this. */
+	if (versindex == 0)
+		return try_to_force_load(mod, symname) == 0;
+
+	versions = (void *)sechdrs[versindex].sh_addr;
+	num_versions = sechdrs[versindex].sh_size
+		/ sizeof(struct modversion_info);
+
+	for (i = 0; i < num_versions; i++) {
+		u32 crcval;
+
+		if (strcmp(versions[i].name, symname) != 0)
+			continue;
+
+		if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
+			crcval = resolve_rel_crc(crc);
+		else
+			crcval = *crc;
+		if (versions[i].crc == crcval)
+			return 1;
+		pr_debug("Found checksum %X vs module %lX\n",
+			 crcval, versions[i].crc);
+		goto bad_version;
+	}
+
+	/* Broken toolchain. Warn once, then let it go.. */
+	pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+	return 1;
+
+bad_version:
+	pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname);
+	return 0;
+}
+
+int check_modstruct_version(const struct load_info *info,
+			    struct module *mod)
+{
+	struct find_symbol_arg fsa = {
+		.name	= "module_layout",
+		.gplok	= true,
+	};
+
+	/*
+	 * Since this should be found in kernel (which can't be removed), no
+	 * locking is necessary -- use preempt_disable() to placate lockdep.
+	 */
+	preempt_disable();
+	if (!find_symbol(&fsa)) {
+		preempt_enable();
+		BUG();
+	}
+	preempt_enable();
+	return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+int same_magic(const char *amagic, const char *bmagic,
+	       bool has_crcs)
+{
+	if (has_crcs) {
+		amagic += strcspn(amagic, " ");
+		bmagic += strcspn(bmagic, " ");
+	}
+	return strcmp(amagic, bmagic) == 0;
+}
+
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+		   struct modversion_info *ver,
+		   struct kernel_param *kp,
+		   struct kernel_symbol *ks,
+		   struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
-- 
cgit v1.2.3


From 0597579356fe5b6c0b99196e4743d4c2978f654a Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 10:00:58 +0100
Subject: module: Make module_enable_x() independent of
 CONFIG_ARCH_HAS_STRICT_MODULE_RWX

module_enable_x() has nothing to do with CONFIG_ARCH_HAS_STRICT_MODULE_RWX
allthough by coincidence architectures who need module_enable_x() are
selection CONFIG_ARCH_HAS_STRICT_MODULE_RWX.

Enable module_enable_x() for everyone everytime. If an architecture
already has module text set executable, it's a no-op.

Don't check text_size alignment. When CONFIG_STRICT_MODULE_RWX is set
the verification is already done in frob_rodata(). When
CONFIG_STRICT_MODULE_RWX is not set it is not a big deal to have the
start of data as executable. Just make sure we entirely get the last
page when the boundary is not aligned.

And don't BUG on misaligned base as some architectures like nios2
use kmalloc() for allocating modules. So just bail out in that case.
If that's a problem, a page fault will occur later anyway.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h |  6 ++----
 kernel/module/main.c     | 12 +++++-------
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 3fc139d5074b..972bc811dcd2 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -23,9 +23,9 @@
 /*
  * Modules' sections will be aligned on page boundaries
  * to ensure complete separation of code and data, but
- * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
+ * only when CONFIG_STRICT_MODULE_RWX=y
  */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+#ifdef CONFIG_STRICT_MODULE_RWX
 # define debug_align(X) PAGE_ALIGN(X)
 #else
 # define debug_align(X) (X)
@@ -175,10 +175,8 @@ static inline struct module *mod_find(unsigned long addr)
 }
 #endif /* CONFIG_MODULES_TREE_LOOKUP */
 
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
 void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start,
 								     int num_pages));
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
 
 #ifdef CONFIG_STRICT_MODULE_RWX
 void module_enable_ro(const struct module *mod, bool after_init);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5f06c06df9e1..2bf8c617a901 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1144,24 +1144,22 @@ resolve_symbol_wait(struct module *mod,
  * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
  * whether we are strict.
  */
-#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
 void frob_text(const struct module_layout *layout,
 	       int (*set_memory)(unsigned long start, int num_pages))
 {
-	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
 	set_memory((unsigned long)layout->base,
-		   layout->text_size >> PAGE_SHIFT);
+		   PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT);
 }
 
 static void module_enable_x(const struct module *mod)
 {
+	if (!PAGE_ALIGNED(mod->core_layout.base) ||
+	    !PAGE_ALIGNED(mod->init_layout.base))
+		return;
+
 	frob_text(&mod->core_layout, set_memory_x);
 	frob_text(&mod->init_layout, set_memory_x);
 }
-#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
-static void module_enable_x(const struct module *mod) { }
-#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
 
 void __weak module_memfree(void *module_region)
 {
-- 
cgit v1.2.3


From 32a08c17d8096f0fd2c6600bc5fe8464aaf68ea7 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 10:00:59 +0100
Subject: module: Move module_enable_x() and frob_text() in strict_rwx.c

Move module_enable_x() together with module_enable_nx() and
module_enable_ro().

Those three functions are going together, they are all used
to set up the correct page flags on the different sections.

As module_enable_x() is used independently of
CONFIG_STRICT_MODULE_RWX, build strict_rwx.c all the time and
use IS_ENABLED(CONFIG_STRICT_MODULE_RWX) when relevant.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile     |  3 +--
 kernel/module/internal.h   | 15 +--------------
 kernel/module/main.c       | 37 -----------------------------------
 kernel/module/strict_rwx.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 50 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index e2eff9853a28..d1ca799c12e2 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -7,12 +7,11 @@
 # and produce insane amounts of uninteresting coverage.
 KCOV_INSTRUMENT_module.o := n
 
-obj-y += main.o
+obj-y += main.o strict_rwx.o
 obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
 obj-$(CONFIG_MODULE_SIG) += signing.o
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
-obj-$(CONFIG_STRICT_MODULE_RWX) += strict_rwx.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PROC_FS) += procfs.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 972bc811dcd2..c59473b232df 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -175,25 +175,12 @@ static inline struct module *mod_find(unsigned long addr)
 }
 #endif /* CONFIG_MODULES_TREE_LOOKUP */
 
-void frob_text(const struct module_layout *layout, int (*set_memory)(unsigned long start,
-								     int num_pages));
-
-#ifdef CONFIG_STRICT_MODULE_RWX
 void module_enable_ro(const struct module *mod, bool after_init);
 void module_enable_nx(const struct module *mod);
+void module_enable_x(const struct module *mod);
 int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 				char *secstrings, struct module *mod);
 
-#else /* !CONFIG_STRICT_MODULE_RWX */
-static inline void module_enable_nx(const struct module *mod) { }
-static inline void module_enable_ro(const struct module *mod, bool after_init) {}
-static inline int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
-				       char *secstrings, struct module *mod)
-{
-	return 0;
-}
-#endif /* CONFIG_STRICT_MODULE_RWX */
-
 #ifdef CONFIG_MODULE_SIG
 int module_sig_check(struct load_info *info, int flags);
 #else /* !CONFIG_MODULE_SIG */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 2bf8c617a901..7175b44ba88a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1124,43 +1124,6 @@ resolve_symbol_wait(struct module *mod,
 	return ksym;
 }
 
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- *          [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^                ^               ^               ^
- * ro_size ------------------------|               |               |
- * ro_after_init_size -----------------------------|               |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base)
- */
-
-/*
- * Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the
- * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of
- * whether we are strict.
- */
-void frob_text(const struct module_layout *layout,
-	       int (*set_memory)(unsigned long start, int num_pages))
-{
-	set_memory((unsigned long)layout->base,
-		   PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT);
-}
-
-static void module_enable_x(const struct module *mod)
-{
-	if (!PAGE_ALIGNED(mod->core_layout.base) ||
-	    !PAGE_ALIGNED(mod->init_layout.base))
-		return;
-
-	frob_text(&mod->core_layout, set_memory_x);
-	frob_text(&mod->init_layout, set_memory_x);
-}
-
 void __weak module_memfree(void *module_region)
 {
 	/*
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 7949dfd449c2..43332b4416b0 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -11,6 +11,34 @@
 #include <linux/set_memory.h>
 #include "internal.h"
 
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ *
+ * General layout of module is:
+ *          [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^                ^               ^               ^
+ * ro_size ------------------------|               |               |
+ * ro_after_init_size -----------------------------|               |
+ * size -----------------------------------------------------------|
+ *
+ * These values are always page-aligned (as is base) when
+ * CONFIG_STRICT_MODULE_RWX is set.
+ */
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of
+ * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
+ * are strict.
+ */
+static void frob_text(const struct module_layout *layout,
+		      int (*set_memory)(unsigned long start, int num_pages))
+{
+	set_memory((unsigned long)layout->base,
+		   PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT);
+}
+
 static void frob_rodata(const struct module_layout *layout,
 		 int (*set_memory)(unsigned long start, int num_pages))
 {
@@ -41,10 +69,24 @@ static void frob_writable_data(const struct module_layout *layout,
 		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
 }
 
+void module_enable_x(const struct module *mod)
+{
+	if (!PAGE_ALIGNED(mod->core_layout.base) ||
+	    !PAGE_ALIGNED(mod->init_layout.base))
+		return;
+
+	frob_text(&mod->core_layout, set_memory_x);
+	frob_text(&mod->init_layout, set_memory_x);
+}
+
 void module_enable_ro(const struct module *mod, bool after_init)
 {
+	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		return;
+#ifdef CONFIG_STRICT_MODULE_RWX
 	if (!rodata_enabled)
 		return;
+#endif
 
 	set_vm_flush_reset_perms(mod->core_layout.base);
 	set_vm_flush_reset_perms(mod->init_layout.base);
@@ -60,6 +102,9 @@ void module_enable_ro(const struct module *mod, bool after_init)
 
 void module_enable_nx(const struct module *mod)
 {
+	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		return;
+
 	frob_rodata(&mod->core_layout, set_memory_nx);
 	frob_ro_after_init(&mod->core_layout, set_memory_nx);
 	frob_writable_data(&mod->core_layout, set_memory_nx);
@@ -73,6 +118,9 @@ int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 	const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
 	int i;
 
+	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		return 0;
+
 	for (i = 0; i < hdr->e_shnum; i++) {
 		if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
 			pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
-- 
cgit v1.2.3


From ef505058dc5524488a84423b4d5e8a7598a23a2e Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 10:01:00 +0100
Subject: module: Rework layout alignment to avoid BUG_ON()s

Perform layout alignment verification up front and WARN_ON()
and fail module loading instead of crashing the machine.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h   |  1 +
 kernel/module/main.c       |  5 +++++
 kernel/module/strict_rwx.c | 27 ++++++++++++++++++---------
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index c59473b232df..e94defbeda00 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -180,6 +180,7 @@ void module_enable_nx(const struct module *mod);
 void module_enable_x(const struct module *mod);
 int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 				char *secstrings, struct module *mod);
+bool module_check_misalignment(const struct module *mod);
 
 #ifdef CONFIG_MODULE_SIG
 int module_sig_check(struct load_info *info, int flags);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7175b44ba88a..55e710bc7f46 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2550,6 +2550,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	/* This relies on module_mutex for list integrity. */
 	module_bug_finalize(info->hdr, info->sechdrs, mod);
 
+	if (module_check_misalignment(mod))
+		goto out_misaligned;
+
 	module_enable_ro(mod, false);
 	module_enable_nx(mod);
 	module_enable_x(mod);
@@ -2563,6 +2566,8 @@ static int complete_formation(struct module *mod, struct load_info *info)
 
 	return 0;
 
+out_misaligned:
+	err = -EINVAL;
 out:
 	mutex_unlock(&module_mutex);
 	return err;
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 43332b4416b0..f36ea54c1dac 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -42,9 +42,6 @@ static void frob_text(const struct module_layout *layout,
 static void frob_rodata(const struct module_layout *layout,
 		 int (*set_memory)(unsigned long start, int num_pages))
 {
-	BUG_ON(!PAGE_ALIGNED(layout->base));
-	BUG_ON(!PAGE_ALIGNED(layout->text_size));
-	BUG_ON(!PAGE_ALIGNED(layout->ro_size));
 	set_memory((unsigned long)layout->base + layout->text_size,
 		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
 }
@@ -52,9 +49,6 @@ static void frob_rodata(const struct module_layout *layout,
 static void frob_ro_after_init(const struct module_layout *layout,
 			int (*set_memory)(unsigned long start, int num_pages))
 {
-	BUG_ON(!PAGE_ALIGNED(layout->base));
-	BUG_ON(!PAGE_ALIGNED(layout->ro_size));
-	BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size));
 	set_memory((unsigned long)layout->base + layout->ro_size,
 		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
 }
@@ -62,13 +56,28 @@ static void frob_ro_after_init(const struct module_layout *layout,
 static void frob_writable_data(const struct module_layout *layout,
 			int (*set_memory)(unsigned long start, int num_pages))
 {
-	BUG_ON(!PAGE_ALIGNED(layout->base));
-	BUG_ON(!PAGE_ALIGNED(layout->ro_after_init_size));
-	BUG_ON(!PAGE_ALIGNED(layout->size));
 	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
 		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
 }
 
+static bool layout_check_misalignment(const struct module_layout *layout)
+{
+	return WARN_ON(!PAGE_ALIGNED(layout->base)) ||
+	       WARN_ON(!PAGE_ALIGNED(layout->text_size)) ||
+	       WARN_ON(!PAGE_ALIGNED(layout->ro_size)) ||
+	       WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) ||
+	       WARN_ON(!PAGE_ALIGNED(layout->size));
+}
+
+bool module_check_misalignment(const struct module *mod)
+{
+	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		return false;
+
+	return layout_check_misalignment(&mod->core_layout) ||
+	       layout_check_misalignment(&mod->init_layout);
+}
+
 void module_enable_x(const struct module *mod)
 {
 	if (!PAGE_ALIGNED(mod->core_layout.base) ||
-- 
cgit v1.2.3


From 7337f929d5672e37a80c8582d357f084320f475f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 10:01:01 +0100
Subject: module: Rename debug_align() as strict_align()

debug_align() was added by commit 84e1c6bb38eb ("x86: Add RO/NX
protection for loadable kernel modules")

At that time the config item was CONFIG_DEBUG_SET_MODULE_RONX.

But nowadays it has changed to CONFIG_STRICT_MODULE_RWX and
debug_align() is confusing because it has nothing to do with
DEBUG.

Rename it strict_align()

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h |  4 ++--
 kernel/module/kallsyms.c |  4 ++--
 kernel/module/main.c     | 14 +++++++-------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index e94defbeda00..cbc268af23ae 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -26,9 +26,9 @@
  * only when CONFIG_STRICT_MODULE_RWX=y
  */
 #ifdef CONFIG_STRICT_MODULE_RWX
-# define debug_align(X) PAGE_ALIGN(X)
+# define strict_align(X) PAGE_ALIGN(X)
 #else
-# define debug_align(X) (X)
+# define strict_align(X) (X)
 #endif
 
 extern struct mutex module_mutex;
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index a3da0686a2a6..438492e103c8 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -139,7 +139,7 @@ void layout_symtab(struct module *mod, struct load_info *info)
 	mod->core_layout.size += strtab_size;
 	info->core_typeoffs = mod->core_layout.size;
 	mod->core_layout.size += ndst * sizeof(char);
-	mod->core_layout.size = debug_align(mod->core_layout.size);
+	mod->core_layout.size = strict_align(mod->core_layout.size);
 
 	/* Put string table section at end of init part of module. */
 	strsect->sh_flags |= SHF_ALLOC;
@@ -154,7 +154,7 @@ void layout_symtab(struct module *mod, struct load_info *info)
 	mod->init_layout.size += sizeof(struct mod_kallsyms);
 	info->init_typeoffs = mod->init_layout.size;
 	mod->init_layout.size += nsrc * sizeof(char);
-	mod->init_layout.size = debug_align(mod->init_layout.size);
+	mod->init_layout.size = strict_align(mod->init_layout.size);
 }
 
 /*
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 55e710bc7f46..e6a72c3651f6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1447,19 +1447,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		}
 		switch (m) {
 		case 0: /* executable */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.size = strict_align(mod->core_layout.size);
 			mod->core_layout.text_size = mod->core_layout.size;
 			break;
 		case 1: /* RO: text and ro-data */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.size = strict_align(mod->core_layout.size);
 			mod->core_layout.ro_size = mod->core_layout.size;
 			break;
 		case 2: /* RO after init */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.size = strict_align(mod->core_layout.size);
 			mod->core_layout.ro_after_init_size = mod->core_layout.size;
 			break;
 		case 4: /* whole core */
-			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.size = strict_align(mod->core_layout.size);
 			break;
 		}
 	}
@@ -1481,11 +1481,11 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		}
 		switch (m) {
 		case 0: /* executable */
-			mod->init_layout.size = debug_align(mod->init_layout.size);
+			mod->init_layout.size = strict_align(mod->init_layout.size);
 			mod->init_layout.text_size = mod->init_layout.size;
 			break;
 		case 1: /* RO: text and ro-data */
-			mod->init_layout.size = debug_align(mod->init_layout.size);
+			mod->init_layout.size = strict_align(mod->init_layout.size);
 			mod->init_layout.ro_size = mod->init_layout.size;
 			break;
 		case 2:
@@ -1496,7 +1496,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
 			break;
 		case 4: /* whole init */
-			mod->init_layout.size = debug_align(mod->init_layout.size);
+			mod->init_layout.size = strict_align(mod->init_layout.size);
 			break;
 		}
 	}
-- 
cgit v1.2.3


From 80b8bf4369906aefbcb63a03012aed7a1abcbd18 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 13:02:11 +0100
Subject: module: Always have struct mod_tree_root

In order to separate text and data, we need to setup
two rb trees.

This means that struct mod_tree_root is required even without
MODULES_TREE_LOOKUP.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Aaron Tomlin <atomlin@atomlin.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h | 4 +++-
 kernel/module/main.c     | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index cbc268af23ae..ac64e53ac5e3 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -143,15 +143,17 @@ static inline void module_decompress_cleanup(struct load_info *info)
 }
 #endif
 
-#ifdef CONFIG_MODULES_TREE_LOOKUP
 struct mod_tree_root {
+#ifdef CONFIG_MODULES_TREE_LOOKUP
 	struct latch_tree_root root;
+#endif
 	unsigned long addr_min;
 	unsigned long addr_max;
 };
 
 extern struct mod_tree_root mod_tree;
 
+#ifdef CONFIG_MODULES_TREE_LOOKUP
 void mod_tree_insert(struct module *mod);
 void mod_tree_remove_init(struct module *mod);
 void mod_tree_remove(struct module *mod);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index e6a72c3651f6..84d567c57575 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -74,7 +74,6 @@ static void do_free_init(struct work_struct *w);
 static DECLARE_WORK(init_free_wq, do_free_init);
 static LLIST_HEAD(init_free_list);
 
-#ifdef CONFIG_MODULES_TREE_LOOKUP
 struct mod_tree_root mod_tree __cacheline_aligned = {
 	.addr_min = -1UL,
 };
@@ -82,10 +81,6 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
 #define module_addr_min mod_tree.addr_min
 #define module_addr_max mod_tree.addr_max
 
-#else /* !CONFIG_MODULES_TREE_LOOKUP */
-static unsigned long module_addr_min = -1UL, module_addr_max;
-#endif /* CONFIG_MODULES_TREE_LOOKUP */
-
 struct symsearch {
 	const struct kernel_symbol *start, *stop;
 	const s32 *crcs;
-- 
cgit v1.2.3


From 446d55666d5599ca58c1ceac25ce4b5191e70835 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 13:02:12 +0100
Subject: module: Prepare for handling several RB trees

In order to separate text and data, we need to setup
two rb trees.

Modify functions to give the tree as a parameter.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h    |  4 ++--
 kernel/module/main.c        | 16 ++++++++--------
 kernel/module/tree_lookup.c | 20 ++++++++++----------
 3 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index ac64e53ac5e3..0f3146347256 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -157,13 +157,13 @@ extern struct mod_tree_root mod_tree;
 void mod_tree_insert(struct module *mod);
 void mod_tree_remove_init(struct module *mod);
 void mod_tree_remove(struct module *mod);
-struct module *mod_find(unsigned long addr);
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
 #else /* !CONFIG_MODULES_TREE_LOOKUP */
 
 static inline void mod_tree_insert(struct module *mod) { }
 static inline void mod_tree_remove_init(struct module *mod) { }
 static inline void mod_tree_remove(struct module *mod) { }
-static inline struct module *mod_find(unsigned long addr)
+static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
 {
 	struct module *mod;
 
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 84d567c57575..392ac847d90a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -91,22 +91,22 @@ struct symsearch {
  * Bounds of module text, for speeding up __module_address.
  * Protected by module_mutex.
  */
-static void __mod_update_bounds(void *base, unsigned int size)
+static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree)
 {
 	unsigned long min = (unsigned long)base;
 	unsigned long max = min + size;
 
-	if (min < module_addr_min)
-		module_addr_min = min;
-	if (max > module_addr_max)
-		module_addr_max = max;
+	if (min < tree->addr_min)
+		tree->addr_min = min;
+	if (max > tree->addr_max)
+		tree->addr_max = max;
 }
 
 static void mod_update_bounds(struct module *mod)
 {
-	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree);
 	if (mod->init_layout.size)
-		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
+		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree);
 }
 
 static void module_assert_mutex_or_preempt(void)
@@ -3017,7 +3017,7 @@ struct module *__module_address(unsigned long addr)
 
 	module_assert_mutex_or_preempt();
 
-	mod = mod_find(addr);
+	mod = mod_find(addr, &mod_tree);
 	if (mod) {
 		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index 0bc4ec3b22ce..995fe68059db 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -61,14 +61,14 @@ static const struct latch_tree_ops mod_tree_ops = {
 	.comp = mod_tree_comp,
 };
 
-static noinline void __mod_tree_insert(struct mod_tree_node *node)
+static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree)
 {
-	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+	latch_tree_insert(&node->node, &tree->root, &mod_tree_ops);
 }
 
-static void __mod_tree_remove(struct mod_tree_node *node)
+static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree)
 {
-	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+	latch_tree_erase(&node->node, &tree->root, &mod_tree_ops);
 }
 
 /*
@@ -80,28 +80,28 @@ void mod_tree_insert(struct module *mod)
 	mod->core_layout.mtn.mod = mod;
 	mod->init_layout.mtn.mod = mod;
 
-	__mod_tree_insert(&mod->core_layout.mtn);
+	__mod_tree_insert(&mod->core_layout.mtn, &mod_tree);
 	if (mod->init_layout.size)
-		__mod_tree_insert(&mod->init_layout.mtn);
+		__mod_tree_insert(&mod->init_layout.mtn, &mod_tree);
 }
 
 void mod_tree_remove_init(struct module *mod)
 {
 	if (mod->init_layout.size)
-		__mod_tree_remove(&mod->init_layout.mtn);
+		__mod_tree_remove(&mod->init_layout.mtn, &mod_tree);
 }
 
 void mod_tree_remove(struct module *mod)
 {
-	__mod_tree_remove(&mod->core_layout.mtn);
+	__mod_tree_remove(&mod->core_layout.mtn, &mod_tree);
 	mod_tree_remove_init(mod);
 }
 
-struct module *mod_find(unsigned long addr)
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
 {
 	struct latch_tree_node *ltn;
 
-	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+	ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops);
 	if (!ltn)
 		return NULL;
 
-- 
cgit v1.2.3


From 6ab9942c44b2d213a16b2620e4baf0223122222f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 13:02:13 +0100
Subject: module: Introduce data_layout

In order to allow separation of data from text, add another layout,
called data_layout. For architectures requesting separation of text
and data, only text will go in core_layout and data will go in
data_layout.

For architectures which keep text and data together, make data_layout
an alias of core_layout, that way data_layout can be used for all
data manipulations, regardless of whether data is in core_layout or
data_layout.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h   |  2 ++
 kernel/module/kallsyms.c   | 18 +++++++++---------
 kernel/module/main.c       | 20 ++++++++++++--------
 kernel/module/strict_rwx.c | 10 +++++-----
 4 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 0f3146347256..0aabbf5cbcd1 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -20,6 +20,8 @@
 /* Maximum number of characters written by module_flags() */
 #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
 
+#define	data_layout core_layout
+
 /*
  * Modules' sections will be aligned on page boundaries
  * to ensure complete separation of code and data, but
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index 438492e103c8..3e11523bc6f6 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -134,12 +134,12 @@ void layout_symtab(struct module *mod, struct load_info *info)
 	}
 
 	/* Append room for core symbols at end of core part. */
-	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
-	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
-	mod->core_layout.size += strtab_size;
-	info->core_typeoffs = mod->core_layout.size;
-	mod->core_layout.size += ndst * sizeof(char);
-	mod->core_layout.size = strict_align(mod->core_layout.size);
+	info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1);
+	info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+	mod->data_layout.size += strtab_size;
+	info->core_typeoffs = mod->data_layout.size;
+	mod->data_layout.size += ndst * sizeof(char);
+	mod->data_layout.size = strict_align(mod->data_layout.size);
 
 	/* Put string table section at end of init part of module. */
 	strsect->sh_flags |= SHF_ALLOC;
@@ -187,9 +187,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
 	 * Now populate the cut down core kallsyms for after init
 	 * and set types up while we still have access to sections.
 	 */
-	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
-	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
-	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
+	mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs;
+	mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
+	mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
 	src = rcu_dereference_sched(mod->kallsyms)->symtab;
 	for (ndst = i = 0; i < rcu_dereference_sched(mod->kallsyms)->num_symtab; i++) {
 		rcu_dereference_sched(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 392ac847d90a..78658283408d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1190,7 +1190,7 @@ static void free_module(struct module *mod)
 	percpu_modfree(mod);
 
 	/* Free lock-classes; relies on the preceding sync_rcu(). */
-	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+	lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
 
 	/* Finally, free the core (containing the module structure) */
 	module_memfree(mod->core_layout.base);
@@ -1431,13 +1431,15 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		for (i = 0; i < info->hdr->e_shnum; ++i) {
 			Elf_Shdr *s = &info->sechdrs[i];
 			const char *sname = info->secstrings + s->sh_name;
+			unsigned int *sizep;
 
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL
 			    || module_init_layout_section(sname))
 				continue;
-			s->sh_entsize = module_get_offset(mod, &mod->core_layout.size, s, i);
+			sizep = m ? &mod->data_layout.size : &mod->core_layout.size;
+			s->sh_entsize = module_get_offset(mod, sizep, s, i);
 			pr_debug("\t%s\n", sname);
 		}
 		switch (m) {
@@ -1446,15 +1448,15 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			mod->core_layout.text_size = mod->core_layout.size;
 			break;
 		case 1: /* RO: text and ro-data */
-			mod->core_layout.size = strict_align(mod->core_layout.size);
-			mod->core_layout.ro_size = mod->core_layout.size;
+			mod->data_layout.size = strict_align(mod->data_layout.size);
+			mod->data_layout.ro_size = mod->data_layout.size;
 			break;
 		case 2: /* RO after init */
-			mod->core_layout.size = strict_align(mod->core_layout.size);
-			mod->core_layout.ro_after_init_size = mod->core_layout.size;
+			mod->data_layout.size = strict_align(mod->data_layout.size);
+			mod->data_layout.ro_after_init_size = mod->data_layout.size;
 			break;
 		case 4: /* whole core */
-			mod->core_layout.size = strict_align(mod->core_layout.size);
+			mod->data_layout.size = strict_align(mod->data_layout.size);
 			break;
 		}
 	}
@@ -2134,6 +2136,8 @@ static int move_module(struct module *mod, struct load_info *info)
 		if (shdr->sh_entsize & INIT_OFFSET_MASK)
 			dest = mod->init_layout.base
 				+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
+		else if (!(shdr->sh_flags & SHF_EXECINSTR))
+			dest = mod->data_layout.base + shdr->sh_entsize;
 		else
 			dest = mod->core_layout.base + shdr->sh_entsize;
 
@@ -2829,7 +2833,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	mutex_unlock(&module_mutex);
  free_module:
 	/* Free lock-classes; relies on the preceding sync_rcu() */
-	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+	lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
 
 	module_deallocate(mod, info);
  free_copy:
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index f36ea54c1dac..fe3c10891407 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -101,12 +101,12 @@ void module_enable_ro(const struct module *mod, bool after_init)
 	set_vm_flush_reset_perms(mod->init_layout.base);
 	frob_text(&mod->core_layout, set_memory_ro);
 
-	frob_rodata(&mod->core_layout, set_memory_ro);
+	frob_rodata(&mod->data_layout, set_memory_ro);
 	frob_text(&mod->init_layout, set_memory_ro);
 	frob_rodata(&mod->init_layout, set_memory_ro);
 
 	if (after_init)
-		frob_ro_after_init(&mod->core_layout, set_memory_ro);
+		frob_ro_after_init(&mod->data_layout, set_memory_ro);
 }
 
 void module_enable_nx(const struct module *mod)
@@ -114,9 +114,9 @@ void module_enable_nx(const struct module *mod)
 	if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
 		return;
 
-	frob_rodata(&mod->core_layout, set_memory_nx);
-	frob_ro_after_init(&mod->core_layout, set_memory_nx);
-	frob_writable_data(&mod->core_layout, set_memory_nx);
+	frob_rodata(&mod->data_layout, set_memory_nx);
+	frob_ro_after_init(&mod->data_layout, set_memory_nx);
+	frob_writable_data(&mod->data_layout, set_memory_nx);
 	frob_rodata(&mod->init_layout, set_memory_nx);
 	frob_writable_data(&mod->init_layout, set_memory_nx);
 }
-- 
cgit v1.2.3


From 01dc0386efb769056257410ba5754558384006a7 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 13:02:14 +0100
Subject: module: Add CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC

Add CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC to allow architectures
to request having modules data in vmalloc area instead of module area.

This is required on powerpc book3s/32 in order to set data non
executable, because it is not possible to set executability on page
basis, this is done per 256 Mbytes segments. The module area has exec
right, vmalloc area has noexec.

This can also be useful on other powerpc/32 in order to maximize the
chance of code being close enough to kernel core to avoid branch
trampolines.

Cc: Jason Wessel <jason.wessel@windriver.com>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mcgrof: rebased in light of kernel/module/kdb.c move]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h    |  3 +++
 kernel/module/kdb.c         | 10 ++++++--
 kernel/module/main.c        | 58 +++++++++++++++++++++++++++++++++++++++++++--
 kernel/module/procfs.c      |  8 +++++--
 kernel/module/strict_rwx.c  |  1 +
 kernel/module/tree_lookup.c |  8 +++++++
 6 files changed, 82 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 0aabbf5cbcd1..3e23bef5884d 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -20,7 +20,9 @@
 /* Maximum number of characters written by module_flags() */
 #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
 
+#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 #define	data_layout core_layout
+#endif
 
 /*
  * Modules' sections will be aligned on page boundaries
@@ -154,6 +156,7 @@ struct mod_tree_root {
 };
 
 extern struct mod_tree_root mod_tree;
+extern struct mod_tree_root mod_data_tree;
 
 #ifdef CONFIG_MODULES_TREE_LOOKUP
 void mod_tree_insert(struct module *mod);
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
index a446c699db0a..f4317f92e189 100644
--- a/kernel/module/kdb.c
+++ b/kernel/module/kdb.c
@@ -26,8 +26,11 @@ int kdb_lsmod(int argc, const char **argv)
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
 
-		kdb_printf("%-20s%8u  0x%px ", mod->name,
-			   mod->core_layout.size, (void *)mod);
+		kdb_printf("%-20s%8u", mod->name, mod->core_layout.size);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+		kdb_printf("/%8u", mod->data_layout.size);
+#endif
+		kdb_printf("  0x%px ", (void *)mod);
 #ifdef CONFIG_MODULE_UNLOAD
 		kdb_printf("%4d ", module_refcount(mod));
 #endif
@@ -38,6 +41,9 @@ int kdb_lsmod(int argc, const char **argv)
 		else
 			kdb_printf(" (Live)");
 		kdb_printf(" 0x%px", mod->core_layout.base);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+		kdb_printf("/0x%px", mod->data_layout.base);
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 		{
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 78658283408d..84b828431dcb 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -78,6 +78,12 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
 	.addr_min = -1UL,
 };
 
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+struct mod_tree_root mod_data_tree __cacheline_aligned = {
+	.addr_min = -1UL,
+};
+#endif
+
 #define module_addr_min mod_tree.addr_min
 #define module_addr_max mod_tree.addr_max
 
@@ -107,6 +113,9 @@ static void mod_update_bounds(struct module *mod)
 	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree);
 	if (mod->init_layout.size)
 		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	__mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree);
+#endif
 }
 
 static void module_assert_mutex_or_preempt(void)
@@ -940,6 +949,17 @@ static ssize_t show_coresize(struct module_attribute *mattr,
 static struct module_attribute modinfo_coresize =
 	__ATTR(coresize, 0444, show_coresize, NULL);
 
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+static ssize_t show_datasize(struct module_attribute *mattr,
+			     struct module_kobject *mk, char *buffer)
+{
+	return sprintf(buffer, "%u\n", mk->mod->data_layout.size);
+}
+
+static struct module_attribute modinfo_datasize =
+	__ATTR(datasize, 0444, show_datasize, NULL);
+#endif
+
 static ssize_t show_initsize(struct module_attribute *mattr,
 			     struct module_kobject *mk, char *buffer)
 {
@@ -968,6 +988,9 @@ struct module_attribute *modinfo_attrs[] = {
 	&modinfo_srcversion,
 	&modinfo_initstate,
 	&modinfo_coresize,
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	&modinfo_datasize,
+#endif
 	&modinfo_initsize,
 	&modinfo_taint,
 #ifdef CONFIG_MODULE_UNLOAD
@@ -1194,6 +1217,9 @@ static void free_module(struct module *mod)
 
 	/* Finally, free the core (containing the module structure) */
 	module_memfree(mod->core_layout.base);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	vfree(mod->data_layout.base);
+#endif
 }
 
 void *__symbol_get(const char *symbol)
@@ -2124,6 +2150,24 @@ static int move_module(struct module *mod, struct load_info *info)
 	} else
 		mod->init_layout.base = NULL;
 
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	/* Do the allocs. */
+	ptr = vmalloc(mod->data_layout.size);
+	/*
+	 * The pointer to this block is stored in the module structure
+	 * which is inside the block. Just mark it as not being a
+	 * leak.
+	 */
+	kmemleak_not_leak(ptr);
+	if (!ptr) {
+		module_memfree(mod->core_layout.base);
+		module_memfree(mod->init_layout.base);
+		return -ENOMEM;
+	}
+
+	memset(ptr, 0, mod->data_layout.size);
+	mod->data_layout.base = ptr;
+#endif
 	/* Transfer each section which specifies SHF_ALLOC */
 	pr_debug("final section addresses:\n");
 	for (i = 0; i < info->hdr->e_shnum; i++) {
@@ -2299,6 +2343,9 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 	module_arch_freeing_init(mod);
 	module_memfree(mod->init_layout.base);
 	module_memfree(mod->core_layout.base);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	vfree(mod->data_layout.base);
+#endif
 }
 
 int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3015,13 +3062,20 @@ bool is_module_address(unsigned long addr)
 struct module *__module_address(unsigned long addr)
 {
 	struct module *mod;
+	struct mod_tree_root *tree;
 
-	if (addr < module_addr_min || addr > module_addr_max)
+	if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
+		tree = &mod_tree;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max)
+		tree = &mod_data_tree;
+#endif
+	else
 		return NULL;
 
 	module_assert_mutex_or_preempt();
 
-	mod = mod_find(addr, &mod_tree);
+	mod = mod_find(addr, tree);
 	if (mod) {
 		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
index 2717e130788e..9a8f4f0f6329 100644
--- a/kernel/module/procfs.c
+++ b/kernel/module/procfs.c
@@ -67,13 +67,17 @@ static int m_show(struct seq_file *m, void *p)
 	struct module *mod = list_entry(p, struct module, list);
 	char buf[MODULE_FLAGS_BUF_SIZE];
 	void *value;
+	unsigned int size;
 
 	/* We always ignore unformed modules. */
 	if (mod->state == MODULE_STATE_UNFORMED)
 		return 0;
 
-	seq_printf(m, "%s %u",
-		   mod->name, mod->init_layout.size + mod->core_layout.size);
+	size = mod->init_layout.size + mod->core_layout.size;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	size += mod->data_layout.size;
+#endif
+	seq_printf(m, "%s %u", mod->name, size);
 	print_unload_info(m, mod);
 
 	/* Informative for users. */
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index fe3c10891407..14fbea66f12f 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -75,6 +75,7 @@ bool module_check_misalignment(const struct module *mod)
 		return false;
 
 	return layout_check_misalignment(&mod->core_layout) ||
+	       layout_check_misalignment(&mod->data_layout) ||
 	       layout_check_misalignment(&mod->init_layout);
 }
 
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index 995fe68059db..8ec5cfd60496 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -83,6 +83,11 @@ void mod_tree_insert(struct module *mod)
 	__mod_tree_insert(&mod->core_layout.mtn, &mod_tree);
 	if (mod->init_layout.size)
 		__mod_tree_insert(&mod->init_layout.mtn, &mod_tree);
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	mod->data_layout.mtn.mod = mod;
+	__mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree);
+#endif
 }
 
 void mod_tree_remove_init(struct module *mod)
@@ -95,6 +100,9 @@ void mod_tree_remove(struct module *mod)
 {
 	__mod_tree_remove(&mod->core_layout.mtn, &mod_tree);
 	mod_tree_remove_init(mod);
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+	__mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree);
+#endif
 }
 
 struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
-- 
cgit v1.2.3


From 55ce556dbf92ec779b65336593d213ceef3f26f3 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Wed, 23 Feb 2022 13:02:15 +0100
Subject: module: Remove module_addr_min and module_addr_max

Replace module_addr_min and module_addr_max by
mod_tree.addr_min and mod_tree.addr_max

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 84b828431dcb..05a42d8fcd7a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -63,7 +63,7 @@
  * Mutex protects:
  * 1) List of modules (also safely readable with preempt_disable),
  * 2) module_use links,
- * 3) module_addr_min/module_addr_max.
+ * 3) mod_tree.addr_min/mod_tree.addr_max.
  * (delete and add uses RCU list operations).
  */
 DEFINE_MUTEX(module_mutex);
@@ -2972,14 +2972,14 @@ static void cfi_init(struct module *mod)
 		mod->exit = *exit;
 #endif
 
-	cfi_module_add(mod, module_addr_min);
+	cfi_module_add(mod, mod_tree.addr_min);
 #endif
 }
 
 static void cfi_cleanup(struct module *mod)
 {
 #ifdef CONFIG_CFI_CLANG
-	cfi_module_remove(mod, module_addr_min);
+	cfi_module_remove(mod, mod_tree.addr_min);
 #endif
 }
 
-- 
cgit v1.2.3


From be77354a3d7ebd4897ee18eca26dca6df9224c76 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 19 Mar 2022 13:38:23 +0530
Subject: bpf: Do write access check for kfunc and global func

When passing pointer to some map value to kfunc or global func, in
verifier we are passing meta as NULL to various functions, which uses
meta->raw_mode to check whether memory is being written to. Since some
kfunc or global funcs may also write to memory pointers they receive as
arguments, we must check for write access to memory. E.g. in some case
map may be read only and this will be missed by current checks.

However meta->raw_mode allows for uninitialized memory (e.g. on stack),
since there is not enough info available through BTF, we must perform
one call for read access (raw_mode = false), and one for write access
(raw_mode = true).

Fixes: e5069b9c23b3 ("bpf: Support pointers in global func args")
Fixes: d583691c47dc ("bpf: Introduce mem, size argument pair support for kfunc")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220319080827.73251-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d175b70067b3..e9807e6e1090 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4919,8 +4919,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
 	 * out. Only upper bounds can be learned because retval is an
 	 * int type and negative retvals are allowed.
 	 */
-	if (meta)
-		meta->msize_max_value = reg->umax_value;
+	meta->msize_max_value = reg->umax_value;
 
 	/* The register is SCALAR_VALUE; the access check
 	 * happens using its boundaries.
@@ -4963,24 +4962,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size)
 {
+	bool may_be_null = type_may_be_null(reg->type);
+	struct bpf_reg_state saved_reg;
+	struct bpf_call_arg_meta meta;
+	int err;
+
 	if (register_is_null(reg))
 		return 0;
 
-	if (type_may_be_null(reg->type)) {
-		/* Assuming that the register contains a value check if the memory
-		 * access is safe. Temporarily save and restore the register's state as
-		 * the conversion shouldn't be visible to a caller.
-		 */
-		const struct bpf_reg_state saved_reg = *reg;
-		int rv;
-
+	memset(&meta, 0, sizeof(meta));
+	/* Assuming that the register contains a value check if the memory
+	 * access is safe. Temporarily save and restore the register's state as
+	 * the conversion shouldn't be visible to a caller.
+	 */
+	if (may_be_null) {
+		saved_reg = *reg;
 		mark_ptr_not_null_reg(reg);
-		rv = check_helper_mem_access(env, regno, mem_size, true, NULL);
-		*reg = saved_reg;
-		return rv;
 	}
 
-	return check_helper_mem_access(env, regno, mem_size, true, NULL);
+	err = check_helper_mem_access(env, regno, mem_size, true, &meta);
+	/* Check access for BPF_WRITE */
+	meta.raw_mode = true;
+	err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta);
+
+	if (may_be_null)
+		*reg = saved_reg;
+
+	return err;
 }
 
 int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
@@ -4989,16 +4997,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state
 	struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
 	bool may_be_null = type_may_be_null(mem_reg->type);
 	struct bpf_reg_state saved_reg;
+	struct bpf_call_arg_meta meta;
 	int err;
 
 	WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
 
+	memset(&meta, 0, sizeof(meta));
+
 	if (may_be_null) {
 		saved_reg = *mem_reg;
 		mark_ptr_not_null_reg(mem_reg);
 	}
 
-	err = check_mem_size_reg(env, reg, regno, true, NULL);
+	err = check_mem_size_reg(env, reg, regno, true, &meta);
+	/* Check access for BPF_WRITE */
+	meta.raw_mode = true;
+	err = err ?: check_mem_size_reg(env, reg, regno, true, &meta);
 
 	if (may_be_null)
 		*mem_reg = saved_reg;
-- 
cgit v1.2.3


From 97e6d7dab1ca4648821c790a2b7913d6d5d549db Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 19 Mar 2022 13:38:24 +0530
Subject: bpf: Check PTR_TO_MEM | MEM_RDONLY in check_helper_mem_access

The commit being fixed was aiming to disallow users from incorrectly
obtaining writable pointer to memory that is only meant to be read. This
is enforced now using a MEM_RDONLY flag.

For instance, in case of global percpu variables, when the BTF type is
not struct (e.g. bpf_prog_active), the verifier marks register type as
PTR_TO_MEM | MEM_RDONLY from bpf_this_cpu_ptr or bpf_per_cpu_ptr
helpers. However, when passing such pointer to kfunc, global funcs, or
BPF helpers, in check_helper_mem_access, there is no expectation
MEM_RDONLY flag will be set, hence it is checked as pointer to writable
memory. Later, verifier sets up argument type of global func as
PTR_TO_MEM | PTR_MAYBE_NULL, so user can use a global func to get around
the limitations imposed by this flag.

This check will also cover global non-percpu variables that may be
introduced in kernel BTF in future.

Also, we update the log message for PTR_TO_BUF case to be similar to
PTR_TO_MEM case, so that the reason for error is clear to user.

Fixes: 34d3a78c681e ("bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM.")
Reviewed-by: Hao Luo <haoluo@google.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220319080827.73251-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9807e6e1090..d953e62b5268 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4871,13 +4871,23 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		return check_map_access(env, regno, reg->off, access_size,
 					zero_size_allowed);
 	case PTR_TO_MEM:
+		if (type_is_rdonly_mem(reg->type)) {
+			if (meta && meta->raw_mode) {
+				verbose(env, "R%d cannot write into %s\n", regno,
+					reg_type_str(env, reg->type));
+				return -EACCES;
+			}
+		}
 		return check_mem_region_access(env, regno, reg->off,
 					       access_size, reg->mem_size,
 					       zero_size_allowed);
 	case PTR_TO_BUF:
 		if (type_is_rdonly_mem(reg->type)) {
-			if (meta && meta->raw_mode)
+			if (meta && meta->raw_mode) {
+				verbose(env, "R%d cannot write into %s\n", regno,
+					reg_type_str(env, reg->type));
 				return -EACCES;
+			}
 
 			max_access = &env->prog->aux->max_rdonly_access;
 		} else {
-- 
cgit v1.2.3


From 7b3552d3f9f6897851fc453b5131a967167e43c2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 19 Mar 2022 13:38:25 +0530
Subject: bpf: Reject writes for PTR_TO_MAP_KEY in check_helper_mem_access

It is not permitted to write to PTR_TO_MAP_KEY, but the current code in
check_helper_mem_access would allow for it, reject this case as well, as
helpers taking ARG_PTR_TO_UNINIT_MEM also take PTR_TO_MAP_KEY.

Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220319080827.73251-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d953e62b5268..9c1a02b82ecd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4861,6 +4861,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
 	case PTR_TO_MAP_KEY:
+		if (meta && meta->raw_mode) {
+			verbose(env, "R%d cannot write into %s\n", regno,
+				reg_type_str(env, reg->type));
+			return -EACCES;
+		}
 		return check_mem_region_access(env, regno, reg->off, access_size,
 					       reg->map_ptr->key_size, false);
 	case PTR_TO_MAP_VALUE:
-- 
cgit v1.2.3


From 9b5a7f4a2a8dcda461f9c7a6671150f4a8a902e8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Thu, 31 Mar 2022 10:57:28 -0700
Subject: x86/configs: Add x86 debugging Kconfig fragment plus docs

The kernel has a wide variety of debugging options to help catch
and squash bugs.  However, new debugging is added all the time and
the existing options can be hard to find.

Add a Kconfig fragment with the debugging options which tip
maintainers expect to be used to test contributions.

This should make it easier for contributors to test their code and
find issues before submission.

  [ bp: Add to "make help" output, fix DEBUG_INFO selection as pointed
        out by Nathan Chancellor <nathan@kernel.org>. ]

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220331175728.299103A0@davehans-spike.ostc.intel.com
---
 kernel/configs/x86_debug.config | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 kernel/configs/x86_debug.config

(limited to 'kernel')

diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
new file mode 100644
index 000000000000..dcd86f32f4ed
--- /dev/null
+++ b/kernel/configs/x86_debug.config
@@ -0,0 +1,18 @@
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_KMEMCHECK=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_GCOV_KERNEL=y
+CONFIG_LOCKDEP=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_SCHEDSTATS=y
+CONFIG_VMLINUX_VALIDATION=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
-- 
cgit v1.2.3


From a60707d74bd1d119cf7bcc5101cda912fc46d5e3 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:45:57 +0800
Subject: sched: Move child_runs_first sysctls to fair.c

move child_runs_first sysctls to fair.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/fair.c  | 19 +++++++++++++++++++
 kernel/sched/sched.h |  2 ++
 kernel/sysctl.c      |  7 -------
 3 files changed, 21 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bd299d67ab..788b1d6a3248 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -109,6 +109,25 @@ static unsigned int sched_nr_latency = 8;
  * parent will (try to) run first.
  */
 unsigned int sysctl_sched_child_runs_first __read_mostly;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_child_runs_first_sysctls[] = {
+	{
+		.procname       = "sched_child_runs_first",
+		.data           = &sysctl_sched_child_runs_first,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{}
+};
+
+static int __init sched_child_runs_first_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_child_runs_first_sysctls);
+	return 0;
+}
+late_initcall(sched_child_runs_first_sysctl_init);
+#endif
 
 /*
  * SCHED_OTHER wake-up granularity.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58263f90c559..767fc1de9646 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -108,6 +108,8 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
+extern unsigned int sysctl_sched_child_runs_first;
+
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 830aaf8ca08e..6bbb8e1af675 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,13 +1659,6 @@ int proc_do_static_key(struct ctl_table *table, int write,
 }
 
 static struct ctl_table kern_table[] = {
-	{
-		.procname	= "sched_child_runs_first",
-		.data		= &sysctl_sched_child_runs_first,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
-- 
cgit v1.2.3


From f5ef06d58be8311a9425e6a54a053ecb350952f3 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:45:58 +0800
Subject: sched: Move schedstats sysctls to core.c

move schedstats sysctls to core.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/core.c | 22 +++++++++++++++++++++-
 kernel/sysctl.c     | 11 -----------
 2 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..04440da5955f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4430,7 +4430,7 @@ out:
 __setup("schedstats=", setup_schedstats);
 
 #ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table t;
@@ -4449,6 +4449,26 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 		set_schedstats(state);
 	return err;
 }
+
+static struct ctl_table sched_schedstats_sysctls[] = {
+	{
+		.procname       = "sched_schedstats",
+		.data           = NULL,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sysctl_schedstats,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{}
+};
+
+static int __init sched_schedstats_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_schedstats_sysctls);
+	return 0;
+}
+late_initcall(sched_schedstats_sysctl_init);
 #endif /* CONFIG_PROC_SYSCTL */
 #endif /* CONFIG_SCHEDSTATS */
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6bbb8e1af675..fc0eeca20718 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,17 +1659,6 @@ int proc_do_static_key(struct ctl_table *table, int write,
 }
 
 static struct ctl_table kern_table[] = {
-#ifdef CONFIG_SCHEDSTATS
-	{
-		.procname	= "sched_schedstats",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_schedstats,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif /* CONFIG_SCHEDSTATS */
 #ifdef CONFIG_TASK_DELAY_ACCT
 	{
 		.procname	= "task_delayacct",
-- 
cgit v1.2.3


From d9ab0e63fa7f8405fbb19e28c5191e0880a7f2db Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:45:59 +0800
Subject: sched: Move rt_period/runtime sysctls to rt.c

move rt_period/runtime sysctls to rt.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/rcu/rcu.h     |  2 ++
 kernel/sched/core.c  | 13 -------------
 kernel/sched/rt.c    | 43 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |  4 ++++
 kernel/sysctl.c      | 14 --------------
 5 files changed, 48 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..7812c740b3bf 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -23,6 +23,8 @@
 #define RCU_SEQ_CTR_SHIFT	2
 #define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)
 
+extern int sysctl_sched_rt_runtime;
+
 /*
  * Return the counter portion of a sequence number previously returned
  * by rcu_seq_snap() or rcu_seq_current().
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 04440da5955f..774f3229db37 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -145,12 +145,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = 8;
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 #endif
 
-/*
- * period over which we measure -rt task CPU usage in us.
- * default: 1s
- */
-unsigned int sysctl_sched_rt_period = 1000000;
-
 __read_mostly int scheduler_running;
 
 #ifdef CONFIG_SCHED_CORE
@@ -444,13 +438,6 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
-/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
- */
-int sysctl_sched_rt_runtime = 950000;
-
-
 /*
  * Serialization rules:
  *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a32c46889af8..5663bb5ff890 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -13,6 +13,47 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
 
+/*
+ * period over which we measure -rt task CPU usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_rt_sysctls[] = {
+	{
+		.procname       = "sched_rt_period_us",
+		.data           = &sysctl_sched_rt_period,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sched_rt_handler,
+	},
+	{
+		.procname       = "sched_rt_runtime_us",
+		.data           = &sysctl_sched_rt_runtime,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = sched_rt_handler,
+	},
+	{}
+};
+
+static int __init sched_rt_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_rt_sysctls);
+	return 0;
+}
+late_initcall(sched_rt_sysctl_init);
+#endif
+
 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
 	struct rt_bandwidth *rt_b =
@@ -2925,7 +2966,7 @@ static void sched_rt_do_global(void)
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 }
 
-int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
 	int old_period, old_runtime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 767fc1de9646..3b406c78a8e9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -114,6 +114,10 @@ extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
 extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
+
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fc0eeca20718..029bfe06c68d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1681,20 +1681,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-	{
-		.procname	= "sched_rt_period_us",
-		.data		= &sysctl_sched_rt_period,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_rt_handler,
-	},
-	{
-		.procname	= "sched_rt_runtime_us",
-		.data		= &sysctl_sched_rt_runtime,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= sched_rt_handler,
-	},
 	{
 		.procname	= "sched_deadline_period_max_us",
 		.data		= &sysctl_sched_dl_period_max,
-- 
cgit v1.2.3


From 84227c12888b1105725cd2de14705b029bcbb4b2 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:46:00 +0800
Subject: sched: Move deadline_period sysctls to deadline.c

move deadline_period sysctls to deadline.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/deadline.c | 42 ++++++++++++++++++++++++++++++++++--------
 kernel/sysctl.c         | 14 --------------
 2 files changed, 34 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb4255ae0b2c..82e10b74a7b2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,40 @@
  *                    Fabio Checconi <fchecconi@gmail.com>
  */
 
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+static unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_dl_sysctls[] = {
+	{
+		.procname       = "sched_deadline_period_max_us",
+		.data           = &sysctl_sched_dl_period_max,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname       = "sched_deadline_period_min_us",
+		.data           = &sysctl_sched_dl_period_min,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{}
+};
+
+static int __init sched_dl_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_dl_sysctls);
+	return 0;
+}
+late_initcall(sched_dl_sysctl_init);
+#endif
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
 	return container_of(dl_se, struct task_struct, dl);
@@ -2879,14 +2913,6 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 	attr->sched_flags |= dl_se->flags;
 }
 
-/*
- * Default limits for DL period; on the top end we guard against small util
- * tasks still getting ridiculously long effective runtimes, on the bottom end we
- * guard against timer DoS.
- */
-unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
-unsigned int sysctl_sched_dl_period_min = 100;     /* 100 us */
-
 /*
  * This function validates the new parameters of a -deadline task.
  * We ask for the deadline not being zero, and greater or equal
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 029bfe06c68d..2c8c75e11a37 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1681,20 +1681,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-	{
-		.procname	= "sched_deadline_period_max_us",
-		.data		= &sysctl_sched_dl_period_max,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "sched_deadline_period_min_us",
-		.data		= &sysctl_sched_dl_period_min,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "sched_rr_timeslice_ms",
 		.data		= &sysctl_sched_rr_timeslice,
-- 
cgit v1.2.3


From dafd7a9dad22fadcb290b24dff54e2eae3b89776 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:46:01 +0800
Subject: sched: Move rr_timeslice sysctls to rt.c

move rr_timeslice sysctls to rt.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/rt.c    | 13 +++++++++++--
 kernel/sched/sched.h |  1 +
 kernel/sysctl.c      |  7 -------
 3 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5663bb5ff890..71791be36065 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,7 +5,7 @@
  */
 
 int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 /* More than 4 hours if BW_SHIFT equals 20. */
 static const u64 max_rt_runtime = MAX_BW;
 
@@ -27,6 +27,8 @@ int sysctl_sched_rt_runtime = 950000;
 
 static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
+static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos);
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_rt_sysctls[] = {
 	{
@@ -43,6 +45,13 @@ static struct ctl_table sched_rt_sysctls[] = {
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
 	},
+	{
+		.procname       = "sched_rr_timeslice_ms",
+		.data           = &sysctl_sched_rr_timeslice,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = sched_rr_handler,
+	},
 	{}
 };
 
@@ -3005,7 +3014,7 @@ undo:
 	return ret;
 }
 
-int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
 	int ret;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3b406c78a8e9..ae0f6e5a76f9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -117,6 +117,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
 
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
+extern int sched_rr_timeslice;
 
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2c8c75e11a37..b074f70a3e11 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1681,13 +1681,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-	{
-		.procname	= "sched_rr_timeslice_ms",
-		.data		= &sysctl_sched_rr_timeslice,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= sched_rr_handler,
-	},
 #ifdef CONFIG_UCLAMP_TASK
 	{
 		.procname	= "sched_util_clamp_min",
-- 
cgit v1.2.3


From 28f152cd0926596e69d412467b11b6fe6fe4e864 Mon Sep 17 00:00:00 2001
From: Baisong Zhong <zhongbaisong@huawei.com>
Date: Fri, 18 Mar 2022 10:54:17 +0800
Subject: sched/rt: fix build error when CONFIG_SYSCTL is disable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid random build errors which do not select
CONFIG_SYSCTL by depending on it in Kconfig.

This fixes the following warning:

In file included from kernel/sched/build_policy.c:43:
At top level:
kernel/sched/rt.c:3017:12: error: ‘sched_rr_handler’ defined but not used [-Werror=unused-function]
 3017 | static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
      |            ^~~~~~~~~~~~~~~~
kernel/sched/rt.c:2978:12: error: ‘sched_rt_handler’ defined but not used [-Werror=unused-function]
 2978 | static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
      |            ^~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors
make[2]: *** [scripts/Makefile.build:310: kernel/sched/build_policy.o] Error 1
make[1]: *** [scripts/Makefile.build:638: kernel/sched] Error 2
make[1]: *** Waiting for unfinished jobs....

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Baisong Zhong <zhongbaisong@huawei.com>
[mcgrof: small build fix, we need sched_rt_can_attach() even
 when CONFIG_SYSCTL is disabled]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/rt.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 71791be36065..b491a0f8c25d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,7 +5,6 @@
  */
 
 int sched_rr_timeslice = RR_TIMESLICE;
-static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 /* More than 4 hours if BW_SHIFT equals 20. */
 static const u64 max_rt_runtime = MAX_BW;
 
@@ -25,11 +24,12 @@ unsigned int sysctl_sched_rt_period = 1000000;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+#ifdef CONFIG_SYSCTL
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
 static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
-#ifdef CONFIG_SYSCTL
 static struct ctl_table sched_rt_sysctls[] = {
 	{
 		.procname       = "sched_rt_period_us",
@@ -2911,6 +2911,7 @@ long sched_group_rt_period(struct task_group *tg)
 	return rt_period_us;
 }
 
+#ifdef CONFIG_SYSCTL
 static int sched_rt_global_constraints(void)
 {
 	int ret = 0;
@@ -2921,6 +2922,7 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
+#endif /* CONFIG_SYSCTL */
 
 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
@@ -2932,6 +2934,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 }
 
 #else /* !CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SYSCTL
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
@@ -2949,8 +2953,10 @@ static int sched_rt_global_constraints(void)
 
 	return 0;
 }
+#endif /* CONFIG_SYSCTL */
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_SYSCTL
 static int sched_rt_global_validate(void)
 {
 	if (sysctl_sched_rt_period <= 0)
@@ -3035,6 +3041,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
 
 	return ret;
 }
+#endif /* CONFIG_SYSCTL */
 
 #ifdef CONFIG_SCHED_DEBUG
 void print_rt_stats(struct seq_file *m, int cpu)
-- 
cgit v1.2.3


From 3267e0156c3341ac25b37a0f60551cdae1634b60 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:46:02 +0800
Subject: sched: Move uclamp_util sysctls to core.c

move uclamp_util sysctls to core.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/core.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 kernel/sysctl.c     | 23 -----------------------
 2 files changed, 37 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 774f3229db37..ef31751c5799 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1306,10 +1306,10 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 static DEFINE_MUTEX(uclamp_mutex);
 
 /* Max allowed minimum utilization */
-unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 
 /* Max allowed maximum utilization */
-unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
 /*
  * By default RT tasks run at the maximum performance point/capacity of the
@@ -1326,7 +1326,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
  * This knob will not override the system default sched_util_clamp_min defined
  * above.
  */
-unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
 
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -1779,7 +1779,7 @@ static void uclamp_update_root_tg(void)
 static void uclamp_update_root_tg(void) { }
 #endif
 
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
 	bool update_root_tg = false;
@@ -4436,8 +4436,12 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 		set_schedstats(state);
 	return err;
 }
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_SCHEDSTATS */
 
-static struct ctl_table sched_schedstats_sysctls[] = {
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
 	{
 		.procname       = "sched_schedstats",
 		.data           = NULL,
@@ -4447,17 +4451,39 @@ static struct ctl_table sched_schedstats_sysctls[] = {
 		.extra1         = SYSCTL_ZERO,
 		.extra2         = SYSCTL_ONE,
 	},
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+	{
+		.procname       = "sched_util_clamp_min",
+		.data           = &sysctl_sched_uclamp_util_min,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sysctl_sched_uclamp_handler,
+	},
+	{
+		.procname       = "sched_util_clamp_max",
+		.data           = &sysctl_sched_uclamp_util_max,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sysctl_sched_uclamp_handler,
+	},
+	{
+		.procname       = "sched_util_clamp_min_rt_default",
+		.data           = &sysctl_sched_uclamp_util_min_rt_default,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sysctl_sched_uclamp_handler,
+	},
+#endif /* CONFIG_UCLAMP_TASK */
 	{}
 };
-
-static int __init sched_schedstats_sysctl_init(void)
+static int __init sched_core_sysctl_init(void)
 {
-	register_sysctl_init("kernel", sched_schedstats_sysctls);
+	register_sysctl_init("kernel", sched_core_sysctls);
 	return 0;
 }
-late_initcall(sched_schedstats_sysctl_init);
-#endif /* CONFIG_PROC_SYSCTL */
-#endif /* CONFIG_SCHEDSTATS */
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
 
 /*
  * fork()/clone()-time setup:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b074f70a3e11..a48c090d57f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1681,29 +1681,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_UCLAMP_TASK
-	{
-		.procname	= "sched_util_clamp_min",
-		.data		= &sysctl_sched_uclamp_util_min,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_sched_uclamp_handler,
-	},
-	{
-		.procname	= "sched_util_clamp_max",
-		.data		= &sysctl_sched_uclamp_util_max,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_sched_uclamp_handler,
-	},
-	{
-		.procname	= "sched_util_clamp_min_rt_default",
-		.data		= &sysctl_sched_uclamp_util_min_rt_default,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_sched_uclamp_handler,
-	},
-#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.procname	= "sched_cfs_bandwidth_slice_us",
-- 
cgit v1.2.3


From d4ae80ffa64f87b9c355692b680b603add084e96 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:46:03 +0800
Subject: sched: Move cfs_bandwidth_slice sysctls to fair.c

move cfs_bandwidth_slice sysctls to fair.c and use the
new register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++--------------------
 kernel/sysctl.c     | 10 ----------
 2 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 788b1d6a3248..265bf7a75a37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -109,25 +109,6 @@ static unsigned int sched_nr_latency = 8;
  * parent will (try to) run first.
  */
 unsigned int sysctl_sched_child_runs_first __read_mostly;
-#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_child_runs_first_sysctls[] = {
-	{
-		.procname       = "sched_child_runs_first",
-		.data           = &sysctl_sched_child_runs_first,
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec,
-	},
-	{}
-};
-
-static int __init sched_child_runs_first_sysctl_init(void)
-{
-	register_sysctl_init("kernel", sched_child_runs_first_sysctls);
-	return 0;
-}
-late_initcall(sched_child_runs_first_sysctl_init);
-#endif
 
 /*
  * SCHED_OTHER wake-up granularity.
@@ -192,7 +173,37 @@ int __weak arch_asym_cpu_priority(int cpu)
  *
  * (default: 5 msec, units: microseconds)
  */
-unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_fair_sysctls[] = {
+	{
+		.procname       = "sched_child_runs_first",
+		.data           = &sysctl_sched_child_runs_first,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.procname       = "sched_cfs_bandwidth_slice_us",
+		.data           = &sysctl_sched_cfs_bandwidth_slice,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ONE,
+	},
+#endif
+	{}
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_fair_sysctls);
+	return 0;
+}
+late_initcall(sched_fair_sysctl_init);
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a48c090d57f9..10ab81c7c457 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1681,16 +1681,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_CFS_BANDWIDTH
-	{
-		.procname	= "sched_cfs_bandwidth_slice_us",
-		.data		= &sysctl_sched_cfs_bandwidth_slice,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ONE,
-	},
-#endif
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	{
 		.procname	= "sched_energy_aware",
-- 
cgit v1.2.3


From 8a0441415b3f9b9a920a6a5086580ea3daa7b884 Mon Sep 17 00:00:00 2001
From: Zhen Ni <nizhen@uniontech.com>
Date: Tue, 15 Feb 2022 19:46:04 +0800
Subject: sched: Move energy_aware sysctls to topology.c

move energy_aware sysctls to topology.c and use the new
register_sysctl_init() to register the sysctl interface.

Signed-off-by: Zhen Ni <nizhen@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/topology.c | 25 +++++++++++++++++++++++--
 kernel/sysctl.c         | 11 -----------
 2 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 810750e62118..05b6c2ad90b9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -206,7 +206,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-unsigned int sysctl_sched_energy_aware = 1;
+static unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
 
@@ -220,7 +220,7 @@ void rebuild_sched_domains_energy(void)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-int sched_energy_aware_handler(struct ctl_table *table, int write,
+static int sched_energy_aware_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
@@ -237,6 +237,27 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
 
 	return ret;
 }
+
+static struct ctl_table sched_energy_aware_sysctls[] = {
+	{
+		.procname       = "sched_energy_aware",
+		.data           = &sysctl_sched_energy_aware,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sched_energy_aware_handler,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{}
+};
+
+static int __init sched_energy_aware_sysctl_init(void)
+{
+	register_sysctl_init("kernel", sched_energy_aware_sysctls);
+	return 0;
+}
+
+late_initcall(sched_energy_aware_sysctl_init);
 #endif
 
 static void free_pd(struct perf_domain *pd)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 10ab81c7c457..8241c5401ee8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1681,17 +1681,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-	{
-		.procname	= "sched_energy_aware",
-		.data		= &sysctl_sched_energy_aware,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_energy_aware_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
-- 
cgit v1.2.3


From 06d177662fb86b80c7fc2290667b9a14cb0bd925 Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Thu, 17 Feb 2022 12:23:21 +0800
Subject: kernel/reboot: move reboot sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

All filesystem syctls now get reviewed by fs folks. This commit
follows the commit of fs, move the poweroff_cmd and ctrl-alt-del
sysctls to its own file, kernel/reboot.c.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/reboot.c | 34 ++++++++++++++++++++++++++++++++--
 kernel/sysctl.c | 14 --------------
 2 files changed, 32 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6bcc5d6a6572..ed4e6dfb7d44 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -23,7 +23,7 @@
  * this indicates whether you can reboot with ctrl-alt-del: the default is yes
  */
 
-int C_A_D = 1;
+static int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
@@ -417,9 +417,37 @@ void ctrl_alt_del(void)
 		kill_cad_pid(SIGINT, 1);
 }
 
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+#define POWEROFF_CMD_PATH_LEN  256
+static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
 static const char reboot_cmd[] = "/sbin/reboot";
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_reboot_table[] = {
+	{
+		.procname       = "poweroff_cmd",
+		.data           = &poweroff_cmd,
+		.maxlen         = POWEROFF_CMD_PATH_LEN,
+		.mode           = 0644,
+		.proc_handler   = proc_dostring,
+	},
+	{
+		.procname       = "ctrl-alt-del",
+		.data           = &C_A_D,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }
+};
+
+static void __init kernel_reboot_sysctls_init(void)
+{
+	register_sysctl_init("kernel", kern_reboot_table);
+}
+#else
+#define kernel_reboot_sysctls_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
 static int run_cmd(const char *cmd)
 {
 	char **argv;
@@ -886,6 +914,8 @@ static int __init reboot_ksysfs_init(void)
 		return ret;
 	}
 
+	kernel_reboot_sysctls_init();
+
 	return 0;
 }
 late_initcall(reboot_ksysfs_init);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8241c5401ee8..5e43569ce2be 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1798,13 +1798,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-	{
-		.procname	= "ctrl-alt-del",
-		.data		= &C_A_D,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_FUNCTION_TRACER
 	{
 		.procname	= "ftrace_enabled",
@@ -2111,13 +2104,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-	{
-		.procname	= "poweroff_cmd",
-		.data		= &poweroff_cmd,
-		.maxlen		= POWEROFF_CMD_PATH_LEN,
-		.mode		= 0644,
-		.proc_handler	= proc_dostring,
-	},
 #ifdef CONFIG_KEYS
 	{
 		.procname	= "keys",
-- 
cgit v1.2.3


From 43fe219aa56a2fdd8f0623c9470a32b14b0617a5 Mon Sep 17 00:00:00 2001
From: sujiaxun <sujiaxun@uniontech.com>
Date: Thu, 17 Feb 2022 18:51:48 -0800
Subject: mm: move oom_kill sysctls to their own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we just
care about the core logic.

So move the oom_kill sysctls to their own file, mm/oom_kill.c

[sfr@canb.auug.org.au: null-terminate the array]
  Link: https://lkml.kernel.org/r/20220216193202.28838626@canb.auug.org.au

Link: https://lkml.kernel.org/r/20220215093203.31032-1-sujiaxun@uniontech.com
Signed-off-by: sujiaxun <sujiaxun@uniontech.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sysctl.c | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5e43569ce2be..a21c0ea396f3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2241,29 +2241,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_TWO,
 	},
-	{
-		.procname	= "panic_on_oom",
-		.data		= &sysctl_panic_on_oom,
-		.maxlen		= sizeof(sysctl_panic_on_oom),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_TWO,
-	},
-	{
-		.procname	= "oom_kill_allocating_task",
-		.data		= &sysctl_oom_kill_allocating_task,
-		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "oom_dump_tasks",
-		.data		= &sysctl_oom_dump_tasks,
-		.maxlen		= sizeof(sysctl_oom_dump_tasks),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
-- 
cgit v1.2.3


From aa779e5102195e1d9ade95dcbc0bfbd8f916eb59 Mon Sep 17 00:00:00 2001
From: zhanglianjie <zhanglianjie@uniontech.com>
Date: Thu, 17 Feb 2022 18:51:51 -0800
Subject: mm: move page-writeback sysctls to their own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we just
care about the core logic.

So move the page-writeback sysctls to its own file.

[akpm@linux-foundation.org: coding-style cleanups]

akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warnings]
Link: https://lkml.kernel.org/r/20220129012955.26594-1-zhanglianjie@uniontech.com
Signed-off-by: zhanglianjie <zhanglianjie@uniontech.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Iurii Zaikin <yzaikin@google.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sysctl.c | 69 ---------------------------------------------------------
 1 file changed, 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a21c0ea396f3..36bfe1c92d44 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -100,8 +100,6 @@
 static const int six_hundred_forty_kb = 640 * 1024;
 #endif
 
-/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
-static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 
 static const int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
@@ -2263,55 +2261,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 	},
-	{
-		.procname	= "dirty_background_ratio",
-		.data		= &dirty_background_ratio,
-		.maxlen		= sizeof(dirty_background_ratio),
-		.mode		= 0644,
-		.proc_handler	= dirty_background_ratio_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE_HUNDRED,
-	},
-	{
-		.procname	= "dirty_background_bytes",
-		.data		= &dirty_background_bytes,
-		.maxlen		= sizeof(dirty_background_bytes),
-		.mode		= 0644,
-		.proc_handler	= dirty_background_bytes_handler,
-		.extra1		= SYSCTL_LONG_ONE,
-	},
-	{
-		.procname	= "dirty_ratio",
-		.data		= &vm_dirty_ratio,
-		.maxlen		= sizeof(vm_dirty_ratio),
-		.mode		= 0644,
-		.proc_handler	= dirty_ratio_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE_HUNDRED,
-	},
-	{
-		.procname	= "dirty_bytes",
-		.data		= &vm_dirty_bytes,
-		.maxlen		= sizeof(vm_dirty_bytes),
-		.mode		= 0644,
-		.proc_handler	= dirty_bytes_handler,
-		.extra1		= (void *)&dirty_bytes_min,
-	},
-	{
-		.procname	= "dirty_writeback_centisecs",
-		.data		= &dirty_writeback_interval,
-		.maxlen		= sizeof(dirty_writeback_interval),
-		.mode		= 0644,
-		.proc_handler	= dirty_writeback_centisecs_handler,
-	},
-	{
-		.procname	= "dirty_expire_centisecs",
-		.data		= &dirty_expire_interval,
-		.maxlen		= sizeof(dirty_expire_interval),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-	},
 	{
 		.procname	= "dirtytime_expire_seconds",
 		.data		= &dirtytime_expire_interval,
@@ -2483,13 +2432,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 	},
 #endif
-	{
-		.procname	= "laptop_mode",
-		.data		= &laptop_mode,
-		.maxlen		= sizeof(laptop_mode),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
 	{
 		.procname	= "vfs_cache_pressure",
 		.data		= &sysctl_vfs_cache_pressure,
@@ -2587,17 +2529,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 	},
 #endif
-#ifdef CONFIG_HIGHMEM
-	{
-		.procname	= "highmem_is_dirtyable",
-		.data		= &vm_highmem_is_dirtyable,
-		.maxlen		= sizeof(vm_highmem_is_dirtyable),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_MEMORY_FAILURE
 	{
 		.procname	= "memory_failure_early_kill",
-- 
cgit v1.2.3


From f79c9b8ae8bde10126586c1bb55b5fd027276d8e Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Fri, 18 Feb 2022 18:58:57 +0800
Subject: kernel/lockdep: move lockdep sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

All filesystem syctls now get reviewed by fs folks. This commit
follows the commit of fs, move the prove_locking and lock_stat sysctls
to its own file, kernel/lockdep.c.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/locking/lockdep.c | 35 +++++++++++++++++++++++++++++++++--
 kernel/sysctl.c          | 21 ---------------------
 2 files changed, 33 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c06cab6546ed..a4382ae1be59 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -64,19 +64,50 @@
 #include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
 module_param(prove_locking, int, 0644);
 #else
 #define prove_locking 0
 #endif
 
 #ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
 module_param(lock_stat, int, 0644);
 #else
 #define lock_stat 0
 #endif
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+	{
+		.procname       = "prove_locking",
+		.data           = &prove_locking,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+	{
+		.procname       = "lock_stat",
+		.data           = &lock_stat,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+#endif /* CONFIG_LOCK_STAT */
+	{ }
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+	register_sysctl_init("kernel", kern_lockdep_table);
+	return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
 DEFINE_PER_CPU(unsigned int, lockdep_recursion);
 EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 36bfe1c92d44..95380d250c8c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -88,9 +88,6 @@
 #ifdef CONFIG_RT_MUTEXES
 #include <linux/rtmutex.h>
 #endif
-#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
-#include <linux/lockdep.h>
-#endif
 
 #if defined(CONFIG_SYSCTL)
 
@@ -1679,24 +1676,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_FOUR,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_PROVE_LOCKING
-	{
-		.procname	= "prove_locking",
-		.data		= &prove_locking,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
-#ifdef CONFIG_LOCK_STAT
-	{
-		.procname	= "lock_stat",
-		.data		= &lock_stat,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 	{
 		.procname	= "panic",
 		.data		= &panic_timeout,
-- 
cgit v1.2.3


From 9df918698408fd914493aba0b7858fef50eba63a Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Fri, 18 Feb 2022 18:59:12 +0800
Subject: kernel/panic: move panic sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

All filesystem syctls now get reviewed by fs folks. This commit
follows the commit of fs, move the oops_all_cpu_backtrace sysctl to
its own file, kernel/panic.c.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/panic.c  | 26 +++++++++++++++++++++++++-
 kernel/sysctl.c | 11 -----------
 2 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index eb4dfb932c85..eb3f2fe4f6d7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -43,7 +43,9 @@
  * Should we dump all CPUs backtraces in an oops event?
  * Defaults to 0, can be changed via sysctl.
  */
-unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
 #endif /* CONFIG_SMP */
 
 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
@@ -73,6 +75,28 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
 EXPORT_SYMBOL(panic_notifier_list);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL)
+static struct ctl_table kern_panic_table[] = {
+	{
+		.procname       = "oops_all_cpu_backtrace",
+		.data           = &sysctl_oops_all_cpu_backtrace,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{ }
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+	register_sysctl_init("kernel", kern_panic_table);
+	return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
 static long no_blink(int state)
 {
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 95380d250c8c..90fc2212b536 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1922,17 +1922,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_SMP
-	{
-		.procname	= "oops_all_cpu_backtrace",
-		.data		= &sysctl_oops_all_cpu_backtrace,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif /* CONFIG_SMP */
 	{
 		.procname	= "pid_max",
 		.data		= &pid_max,
-- 
cgit v1.2.3


From 801b501439d1b366d524dee4fc1e6b3473a95b9a Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Fri, 18 Feb 2022 18:59:23 +0800
Subject: kernel/acct: move acct sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

All filesystem syctls now get reviewed by fs folks. This commit
follows the commit of fs, move the acct sysctl to its own file,
kernel/acct.c.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/acct.c   | 22 +++++++++++++++++++++-
 kernel/sysctl.c | 12 ------------
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 3df53cf1dcd5..13706356ec54 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -70,11 +70,31 @@
  * Turned into sysctl-controllable parameters. AV, 12/11/98
  */
 
-int acct_parm[3] = {4, 2, 30};
+static int acct_parm[3] = {4, 2, 30};
 #define RESUME		(acct_parm[0])	/* >foo% free space - resume */
 #define SUSPEND		(acct_parm[1])	/* <foo% free space - suspend */
 #define ACCT_TIMEOUT	(acct_parm[2])	/* foo second timeout between checks */
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_acct_table[] = {
+	{
+		.procname       = "acct",
+		.data           = &acct_parm,
+		.maxlen         = 3*sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }
+};
+
+static __init int kernel_acct_sysctls_init(void)
+{
+	register_sysctl_init("kernel", kern_acct_table);
+	return 0;
+}
+late_initcall(kernel_acct_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
 /*
  * External references and all of the globals.
  */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 90fc2212b536..5421e28dbb25 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,9 +82,6 @@
 #ifdef CONFIG_SPARC
 #include <asm/setup.h>
 #endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-#include <linux/acct.h>
-#endif
 #ifdef CONFIG_RT_MUTEXES
 #include <linux/rtmutex.h>
 #endif
@@ -1856,15 +1853,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dostring,
 	},
 #endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-	{
-		.procname	= "acct",
-		.data		= &acct_parm,
-		.maxlen		= 3*sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 #ifdef CONFIG_MAGIC_SYSRQ
 	{
 		.procname	= "sysrq",
-- 
cgit v1.2.3


From 1186618a6a35d43a865448472a261184b608d13c Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Fri, 18 Feb 2022 18:59:36 +0800
Subject: kernel/delayacct: move delayacct sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

All filesystem syctls now get reviewed by fs folks. This commit
follows the commit of fs, move the delayacct sysctl to its own file,
kernel/delayacct.c.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/delayacct.c | 22 +++++++++++++++++++++-
 kernel/sysctl.c    | 12 ------------
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c5e8cea9e05f..2c1e18f7c5cf 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,7 +44,7 @@ void delayacct_init(void)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
 		     size_t *lenp, loff_t *ppos)
 {
 	int state = delayacct_on;
@@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
 		set_delayacct(state);
 	return err;
 }
+
+static struct ctl_table kern_delayacct_table[] = {
+	{
+		.procname       = "task_delayacct",
+		.data           = NULL,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = sysctl_delayacct,
+		.extra1         = SYSCTL_ZERO,
+		.extra2         = SYSCTL_ONE,
+	},
+	{ }
+};
+
+static __init int kernel_delayacct_sysctls_init(void)
+{
+	register_sysctl_init("kernel", kern_delayacct_table);
+	return 0;
+}
+late_initcall(kernel_delayacct_sysctls_init);
 #endif
 
 void __delayacct_tsk_init(struct task_struct *tsk)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5421e28dbb25..9b74ba12a711 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,7 +67,6 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/latencytop.h>
 #include <linux/pid.h>
-#include <linux/delayacct.h>
 
 #include "../lib/kstrtox.h"
 
@@ -1651,17 +1650,6 @@ int proc_do_static_key(struct ctl_table *table, int write,
 }
 
 static struct ctl_table kern_table[] = {
-#ifdef CONFIG_TASK_DELAY_ACCT
-	{
-		.procname	= "task_delayacct",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_delayacct,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif /* CONFIG_TASK_DELAY_ACCT */
 #ifdef CONFIG_NUMA_BALANCING
 	{
 		.procname	= "numa_balancing",
-- 
cgit v1.2.3


From d772cc2c321900f3f463a124eebeb7218e66dda6 Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Fri, 18 Feb 2022 18:59:49 +0800
Subject: kernel/do_mount_initrd: move real_root_dev sysctls to its own file

kernel/sysctl.c is a kitchen sink where everyone leaves their dirty
dishes, this makes it very difficult to maintain.

To help with this maintenance let's start by moving sysctls to places
where they actually belong.  The proc sysctl maintainers do not want to
know what sysctl knobs you wish to add for your own piece of code, we
just care about the core logic.

All filesystem syctls now get reviewed by fs folks. This commit
follows the commit of fs, move the real_root_dev sysctl to its own file,
kernel/do_mount_initrd.c.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sysctl.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9b74ba12a711..10a551f8fcab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1693,15 +1693,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sysctl_latencytop,
 	},
-#endif
-#ifdef CONFIG_BLK_DEV_INITRD
-	{
-		.procname	= "real-root-dev",
-		.data		= &real_root_dev,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #endif
 	{
 		.procname	= "print-fatal-signals",
-- 
cgit v1.2.3


From 8e4e83b2278bdfb55cb2b13de07cf0a721ce8af7 Mon Sep 17 00:00:00 2001
From: Wei Xiao <xiaowei66@huawei.com>
Date: Wed, 23 Feb 2022 19:11:53 +0800
Subject: ftrace: move sysctl_ftrace_enabled to ftrace.c

This moves ftrace_enabled to trace/ftrace.c.

We move sysctls to places where features actually belong to improve
the readability of the code and reduce the risk of code merge conflicts.
At the same time, the proc-sysctl maintainers do not want to know what
sysctl knobs you wish to add for your owner piece of code, we just care
about the core logic.

Signed-off-by: Wei Xiao <xiaowei66@huawei.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sysctl.c       |  9 ---------
 kernel/trace/ftrace.c | 22 +++++++++++++++++++++-
 2 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 10a551f8fcab..21172d3dad6e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1751,15 +1751,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_FUNCTION_TRACER
-	{
-		.procname	= "ftrace_enabled",
-		.data		= &ftrace_enabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= ftrace_enable_sysctl,
-	},
-#endif
 #ifdef CONFIG_STACK_TRACER
 	{
 		.procname	= "stack_tracer_enabled",
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f1d2f5e7263..a5efbbc289b4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7921,7 +7921,8 @@ static bool is_permanent_ops_registered(void)
 	return false;
 }
 
-int
+#ifdef CONFIG_SYSCTL
+static int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
 		     void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -7964,3 +7965,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	mutex_unlock(&ftrace_lock);
 	return ret;
 }
+
+static struct ctl_table ftrace_sysctls[] = {
+	{
+		.procname       = "ftrace_enabled",
+		.data           = &ftrace_enabled,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = ftrace_enable_sysctl,
+	},
+	{}
+};
+
+static int __init ftrace_sysctl_init(void)
+{
+	register_sysctl_init("kernel", ftrace_sysctls);
+	return 0;
+}
+late_initcall(ftrace_sysctl_init);
+#endif
-- 
cgit v1.2.3


From 807ff7ed34d2c83e09d7987bd6506cc3d520dcdf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 7 Apr 2022 13:43:15 +0200
Subject: futex: add missing rtmutex.h include
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This isn't included here any more since the removal of ww_mutex.h from
seqlock.h which causes a build break.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Shashank Sharma <shashank.sharma@amd.com>
Fixes: e84815cbbc76 ("seqlock: drop seqcount_ww_mutex_t")
Link: https://patchwork.freedesktop.org/patch/msgid/20220407114619.961750-1-christian.koenig@amd.com
---
 kernel/futex/futex.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c264cbeab71c..b5379c0e6d6d 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -3,6 +3,7 @@
 #define _FUTEX_H
 
 #include <linux/futex.h>
+#include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
 
 #ifdef CONFIG_PREEMPT_RT
-- 
cgit v1.2.3


From bae1a962ac2c5e6be08319ff3f7d6df542584fce Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Wed, 6 Apr 2022 02:29:33 +0300
Subject: x86/topology: Disable CPU online/offline control for TDX guests

Unlike regular VMs, TDX guests use the firmware hand-off wakeup method
to wake up the APs during the boot process. This wakeup model uses a
mailbox to communicate with firmware to bring up the APs. As per the
design, this mailbox can only be used once for the given AP, which means
after the APs are booted, the same mailbox cannot be used to
offline/online the given AP. More details about this requirement can be
found in Intel TDX Virtual Firmware Design Guide, sec titled "AP
initialization in OS" and in sec titled "Hotplug Device".

Since the architecture does not support any method of offlining the
CPUs, disable CPU hotplug support in the kernel.

Since this hotplug disable feature can be re-used by other VM guests,
add a new CC attribute CC_ATTR_HOTPLUG_DISABLED and use it to disable
the hotplug support.

Attempt to offline CPU will fail with -EOPNOTSUPP.

Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20220405232939.73860-25-kirill.shutemov@linux.intel.com
---
 kernel/cpu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5797c2a7a93f..edb8c199f6a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -35,6 +35,7 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/cpuset.h>
 #include <linux/random.h>
+#include <linux/cc_platform.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -1186,6 +1187,12 @@ out:
 
 static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 {
+	/*
+	 * If the platform does not support hotplug, report it explicitly to
+	 * differentiate it from a transient offlining failure.
+	 */
+	if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+		return -EOPNOTSUPP;
 	if (cpu_hotplug_disabled)
 		return -EBUSY;
 	return _cpu_down(cpu, 0, target);
-- 
cgit v1.2.3


From b45043192b3e481304062938a6561da2ceea46a6 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Thu, 7 Apr 2022 21:04:23 +0800
Subject: bpf: Fix excessive memory allocation in stack_map_alloc()

The 'n_buckets * (value_size + sizeof(struct stack_map_bucket))' part of the
allocated memory for 'smap' is never used after the memlock accounting was
removed, thus get rid of it.

[ Note, Daniel:

Commit b936ca643ade ("bpf: rework memlock-based memory accounting for maps")
moved `cost += n_buckets * (value_size + sizeof(struct stack_map_bucket))`
up and therefore before the bpf_map_area_alloc() allocation, sigh. In a later
step commit c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()"),
and the overflow checks of `cost >= U32_MAX - PAGE_SIZE` moved into
bpf_map_charge_init(). And then 370868107bf6 ("bpf: Eliminate rlimit-based
memory accounting for stackmap maps") finally removed the bpf_map_charge_init().
Anyway, the original code did the allocation same way as /after/ this fix. ]

Fixes: b936ca643ade ("bpf: rework memlock-based memory accounting for maps")
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220407130423.798386-1-ytcoode@gmail.com
---
 kernel/bpf/stackmap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6131b4a19572..1dd5266fbebb 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -100,7 +100,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-E2BIG);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
-	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
 	if (!smap)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From c54bc0fc84214b203f7a0ebfd1bd308ce2abe920 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 5 Apr 2022 21:17:32 +0200
Subject: timers: Fix warning condition in __run_timers()

When the timer base is empty, base::next_expiry is set to base::clk +
NEXT_TIMER_MAX_DELTA and base::next_expiry_recalc is false. When no timer
is queued until jiffies reaches base::next_expiry value, the warning for
not finding any expired timer and base::next_expiry_recalc is false in
__run_timers() triggers.

To prevent triggering the warning in this valid scenario
base::timers_pending needs to be added to the warning condition.

Fixes: 31cd0e119d50 ("timers: Recalculate next timer interrupt only when necessary")
Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220405191732.7438-3-anna-maria@linutronix.de
---
 kernel/time/timer.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 85f1021ad459..9dd2a39cb3b0 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1722,11 +1722,14 @@ static inline void __run_timers(struct timer_base *base)
 	       time_after_eq(jiffies, base->next_expiry)) {
 		levels = collect_expired_timers(base, heads);
 		/*
-		 * The only possible reason for not finding any expired
-		 * timer at this clk is that all matching timers have been
-		 * dequeued.
+		 * The two possible reasons for not finding any expired
+		 * timer at this clk are that all matching timers have been
+		 * dequeued or no timer has been queued since
+		 * base::next_expiry was set to base::clk +
+		 * NEXT_TIMER_MAX_DELTA.
 		 */
-		WARN_ON_ONCE(!levels && !base->next_expiry_recalc);
+		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
+			     && base->timers_pending);
 		base->clk++;
 		base->next_expiry = __next_timer_interrupt(base);
 
-- 
cgit v1.2.3


From 2731aa7d65dbb31c6dad14347c37d522bb3bc7c6 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 5 Apr 2022 21:17:31 +0200
Subject: timers: Initialize base::next_expiry_recalc in timers_prepare_cpu()

When base::next_expiry_recalc is not initialized to false during cpu
bringup in HOTPLUG_CPU and is accidently true and no timer is queued in the
meantime, the loop through the wheel to find __next_timer_interrupt() might
be done for nothing.

Therefore initialize base::next_expiry_recalc to false in
timers_prepare_cpu().

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220405191732.7438-2-anna-maria@linutronix.de
---
 kernel/time/timer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9dd2a39cb3b0..204d6cd83d0e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1953,6 +1953,7 @@ int timers_prepare_cpu(unsigned int cpu)
 		base = per_cpu_ptr(&timer_bases[b], cpu);
 		base->clk = jiffies;
 		base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+		base->next_expiry_recalc = false;
 		base->timers_pending = false;
 		base->is_idle = false;
 	}
-- 
cgit v1.2.3


From a2026e44eff5d74a83d7ffee6325a007bef85385 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 4 Apr 2022 16:47:55 +0200
Subject: timers: Simplify calc_index()

The level granularity round up of calc_index() does:

   (x + (1 << n)) >> n

which is obviously equivalent to

   (x >> n) + 1

but compilers can't figure that out despite the fact that the input range
is known to not cause an overflow. It's neither intuitive to read.

Just write out the obvious.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/87h778j46c.ffs@tglx
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 204d6cd83d0e..60aebf2b7f0a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -502,7 +502,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl,
 	 *
 	 * Round up with level granularity to prevent this.
 	 */
-	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+	expires = (expires >> LVL_SHIFT(lvl)) + 1;
 	*bucket_expiry = expires << LVL_SHIFT(lvl);
 	return LVL_OFFS(lvl) + (expires & LVL_MASK);
 }
-- 
cgit v1.2.3


From 40e97e42961f8c6cc7bd5fe67cc18417e02d78f1 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 6 Dec 2021 09:59:50 -0500
Subject: tick/nohz: Use WARN_ON_ONCE() to prevent console saturation

While running some testing on code that happened to allow the variable
tick_nohz_full_running to get set but with no "possible" NOHZ cores to
back up that setting, this warning triggered:

        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                WARN_ON(tick_nohz_full_running);

The console was overwhemled with an endless stream of one WARN per tick
per core and there was no way to even see what was going on w/o using a
serial console to capture it and then trace it back to this.

Change it to WARN_ON_ONCE().

Fixes: 08ae95f4fd3b ("nohz_full: Allow the boot CPU to be nohz_full")
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20211206145950.10927-3-paul.gortmaker@windriver.com
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2d76c91b85de..3506f6ed790c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -188,7 +188,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	 */
 	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
 #ifdef CONFIG_NO_HZ_FULL
-		WARN_ON(tick_nohz_full_running);
+		WARN_ON_ONCE(tick_nohz_full_running);
 #endif
 		tick_do_timer_cpu = cpu;
 	}
-- 
cgit v1.2.3


From 9c95bc25ad3b1a2240cd1f896569292a57d3ce85 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Mon, 14 Feb 2022 16:47:39 +0800
Subject: tick/sched: Fix non-kernel-doc comment

Fixes the following W=1 kernel build warning:

kernel/time/tick-sched.c:1563: warning: This comment starts with '/**',
but isn't a kernel-doc comment.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220214084739.63228-1-jiapeng.chong@linux.alibaba.com
---
 kernel/time/tick-sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3506f6ed790c..d257721c68b8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1538,7 +1538,7 @@ void tick_cancel_sched_timer(int cpu)
 }
 #endif
 
-/**
+/*
  * Async notification about clocksource changes
  */
 void tick_clock_notify(void)
@@ -1559,7 +1559,7 @@ void tick_oneshot_notify(void)
 	set_bit(0, &ts->check_clocks);
 }
 
-/**
+/*
  * Check, if a change happened, which makes oneshot possible.
  *
  * Called cyclic from the hrtimer softirq (driven by the timer
-- 
cgit v1.2.3


From 2966a9918dfab183a1cb6be6794981ebe2abff83 Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Thu, 31 Mar 2022 23:57:07 +0200
Subject: clockevents: Use dedicated list iterator variable

To move the list iterator variable into the list_for_each_entry_*()
macro in the future it should be avoided to use the list iterator
variable after the loop body.

To *never* use the list iterator variable after the loop it was
concluded to use a separate iterator variable.

Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/
Link: https://lore.kernel.org/r/20220331215707.883957-1-jakobkoschel@gmail.com
---
 kernel/time/clockevents.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 003ccf338d20..5d85014d59b5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -690,7 +690,7 @@ static ssize_t unbind_device_store(struct device *dev,
 {
 	char name[CS_NAME_LEN];
 	ssize_t ret = sysfs_get_uname(buf, name, count);
-	struct clock_event_device *ce;
+	struct clock_event_device *ce = NULL, *iter;
 
 	if (ret < 0)
 		return ret;
@@ -698,9 +698,10 @@ static ssize_t unbind_device_store(struct device *dev,
 	ret = -ENODEV;
 	mutex_lock(&clockevents_mutex);
 	raw_spin_lock_irq(&clockevents_lock);
-	list_for_each_entry(ce, &clockevent_devices, list) {
-		if (!strcmp(ce->name, name)) {
-			ret = __clockevents_try_unbind(ce, dev->id);
+	list_for_each_entry(iter, &clockevent_devices, list) {
+		if (!strcmp(iter->name, name)) {
+			ret = __clockevents_try_unbind(iter, dev->id);
+			ce = iter;
 			break;
 		}
 	}
-- 
cgit v1.2.3


From efaa0227f6c6a5073951b20cf2f8c63c4155306c Mon Sep 17 00:00:00 2001
From: tangmeng <tangmeng@uniontech.com>
Date: Tue, 15 Feb 2022 14:50:19 +0800
Subject: timers: Move timer sysctl into the timer code

This is part of the effort to reduce kernel/sysctl.c to only contain the
core logic.

Signed-off-by: tangmeng <tangmeng@uniontech.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220215065019.7520-1-tangmeng@uniontech.com
---
 kernel/sysctl.c     | 11 -----------
 kernel/time/timer.c | 53 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 830aaf8ca08e..5b7b1a82ae6a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2288,17 +2288,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-	{
-		.procname	= "timer_migration",
-		.data		= &sysctl_timer_migration,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= timer_migration_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_BPF_SYSCALL
 	{
 		.procname	= "unprivileged_bpf_disabled",
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 60aebf2b7f0a..ef082d43c307 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -44,6 +44,7 @@
 #include <linux/slab.h>
 #include <linux/compat.h>
 #include <linux/random.h>
+#include <linux/sysctl.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -223,7 +224,7 @@ static void timer_update_keys(struct work_struct *work);
 static DECLARE_WORK(timer_update_work, timer_update_keys);
 
 #ifdef CONFIG_SMP
-unsigned int sysctl_timer_migration = 1;
+static unsigned int sysctl_timer_migration = 1;
 
 DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
 
@@ -234,7 +235,42 @@ static void timers_update_migration(void)
 	else
 		static_branch_disable(&timers_migration_enabled);
 }
-#else
+
+#ifdef CONFIG_SYSCTL
+static int timer_migration_handler(struct ctl_table *table, int write,
+			    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	mutex_lock(&timer_keys_mutex);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (!ret && write)
+		timers_update_migration();
+	mutex_unlock(&timer_keys_mutex);
+	return ret;
+}
+
+static struct ctl_table timer_sysctl[] = {
+	{
+		.procname	= "timer_migration",
+		.data		= &sysctl_timer_migration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= timer_migration_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{}
+};
+
+static int __init timer_sysctl_init(void)
+{
+	register_sysctl("kernel", timer_sysctl);
+	return 0;
+}
+device_initcall(timer_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+#else /* CONFIG_SMP */
 static inline void timers_update_migration(void) { }
 #endif /* !CONFIG_SMP */
 
@@ -251,19 +287,6 @@ void timers_update_nohz(void)
 	schedule_work(&timer_update_work);
 }
 
-int timer_migration_handler(struct ctl_table *table, int write,
-			    void *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret;
-
-	mutex_lock(&timer_keys_mutex);
-	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	if (!ret && write)
-		timers_update_migration();
-	mutex_unlock(&timer_keys_mutex);
-	return ret;
-}
-
 static inline bool is_timers_nohz_active(void)
 {
 	return static_branch_unlikely(&timers_nohz_active);
-- 
cgit v1.2.3


From d802057c7c553ad426520a053da9f9fe08e2c35a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 5 Apr 2022 19:50:38 +0100
Subject: genirq/msi: Shutdown managed interrupts with unsatifiable affinities

When booting with maxcpus=<small number>, interrupt controllers
such as the GICv3 ITS may not be able to satisfy the affinity of
some managed interrupts, as some of the HW resources are simply
not available.

The same thing happens when loading a driver using managed interrupts
while CPUs are offline.

In order to deal with this, do not try to activate such interrupt
if there is no online CPU capable of handling it. Instead, place
it in shutdown state. Once a capable CPU shows up, it will be
activated.

Reported-by: John Garry <john.garry@huawei.com>
Reported-by: David Decotigny <ddecotig@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/20220405185040.206297-2-maz@kernel.org
---
 kernel/irq/msi.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2bdfce5edafd..a9ee535293eb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -818,6 +818,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
 		irqd_clr_can_reserve(irqd);
 		if (vflags & VIRQ_NOMASK_QUIRK)
 			irqd_set_msi_nomask_quirk(irqd);
+
+		/*
+		 * If the interrupt is managed but no CPU is available to
+		 * service it, shut it down until better times. Note that
+		 * we only do this on the !RESERVE path as x86 (the only
+		 * architecture using this flag) deals with this in a
+		 * different way by using a catch-all vector.
+		 */
+		if ((vflags & VIRQ_ACTIVATE) &&
+		    irqd_affinity_is_managed(irqd) &&
+		    !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+					cpu_online_mask)) {
+			    irqd_set_managed_shutdown(irqd);
+			    return 0;
+		    }
 	}
 
 	if (!(vflags & VIRQ_ACTIVATE))
-- 
cgit v1.2.3


From 33de0aa4bae982ed6f7c777f86b5af3e627ac937 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 5 Apr 2022 19:50:39 +0100
Subject: genirq: Always limit the affinity to online CPUs

When booting with maxcpus=<small number> (or even loading a driver
while most CPUs are offline), it is pretty easy to observe managed
affinities containing a mix of online and offline CPUs being passed
to the irqchip driver.

This means that the irqchip cannot trust the affinity passed down
from the core code, which is a bit annoying and requires (at least
in theory) all drivers to implement some sort of affinity narrowing.

In order to address this, always limit the cpumask to the set of
online CPUs.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220405185040.206297-3-maz@kernel.org
---
 kernel/irq/manage.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c03f71d5ec10..f71ecc100545 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -222,11 +222,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 {
 	struct irq_desc *desc = irq_data_to_desc(data);
 	struct irq_chip *chip = irq_data_get_irq_chip(data);
+	const struct cpumask  *prog_mask;
 	int ret;
 
+	static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
+	static struct cpumask tmp_mask;
+
 	if (!chip || !chip->irq_set_affinity)
 		return -EINVAL;
 
+	raw_spin_lock(&tmp_mask_lock);
 	/*
 	 * If this is a managed interrupt and housekeeping is enabled on
 	 * it check whether the requested affinity mask intersects with
@@ -248,24 +253,28 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	 */
 	if (irqd_affinity_is_managed(data) &&
 	    housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
-		const struct cpumask *hk_mask, *prog_mask;
-
-		static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
-		static struct cpumask tmp_mask;
+		const struct cpumask *hk_mask;
 
 		hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
 
-		raw_spin_lock(&tmp_mask_lock);
 		cpumask_and(&tmp_mask, mask, hk_mask);
 		if (!cpumask_intersects(&tmp_mask, cpu_online_mask))
 			prog_mask = mask;
 		else
 			prog_mask = &tmp_mask;
-		ret = chip->irq_set_affinity(data, prog_mask, force);
-		raw_spin_unlock(&tmp_mask_lock);
 	} else {
-		ret = chip->irq_set_affinity(data, mask, force);
+		prog_mask = mask;
 	}
+
+	/* Make sure we only provide online CPUs to the irqchip */
+	cpumask_and(&tmp_mask, prog_mask, cpu_online_mask);
+	if (!cpumask_empty(&tmp_mask))
+		ret = chip->irq_set_affinity(data, &tmp_mask, force);
+	else
+		ret = -EINVAL;
+
+	raw_spin_unlock(&tmp_mask_lock);
+
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
-- 
cgit v1.2.3


From 911488de0565f1d53bd36174d20917ebc4b44c0e Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 10 Feb 2022 14:49:05 -0800
Subject: genirq/affinity: Replace cpumask_weight() with cpumask_empty() where
 appropriate

__irq_build_affinity_masks() calls cpumask_weight() to check if any bit of
a given cpumask is set.

This can be done more efficiently with cpumask_empty() because
cpumask_empty() stops traversing the cpumask as soon as it finds first set
bit, while cpumask_weight() counts all bits unconditionally.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220210224933.379149-22-yury.norov@gmail.com
---
 kernel/irq/affinity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f7ff8919dc9b..18740faf0eb1 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -258,7 +258,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 	nodemask_t nodemsk = NODE_MASK_NONE;
 	struct node_vectors *node_vectors;
 
-	if (!cpumask_weight(cpu_mask))
+	if (cpumask_empty(cpu_mask))
 		return 0;
 
 	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
-- 
cgit v1.2.3


From 8afbcaf8690dac19ebf570a4e4fef9c59c75bf8e Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 10 Feb 2022 14:49:07 -0800
Subject: clocksource: Replace cpumask_weight() with cpumask_empty()

clocksource_verify_percpu() calls cpumask_weight() to check if any bit of a
given cpumask is set.

This can be done more efficiently with cpumask_empty() because
cpumask_empty() stops traversing the cpumask as soon as it finds first set
bit, while cpumask_weight() counts all bits unconditionally.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220210224933.379149-24-yury.norov@gmail.com
---
 kernel/time/clocksource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 95d7ca35bdf2..cee5da1e54c4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
 	cpus_read_lock();
 	preempt_disable();
 	clocksource_verify_choose_cpus();
-	if (cpumask_weight(&cpus_chosen) == 0) {
+	if (cpumask_empty(&cpus_chosen)) {
 		preempt_enable();
 		cpus_read_unlock();
 		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
-- 
cgit v1.2.3


From 08d835dff916bfe8f45acc7b92c7af6c4081c8a7 Mon Sep 17 00:00:00 2001
From: Rei Yamamoto <yamamoto.rei@jp.fujitsu.com>
Date: Thu, 31 Mar 2022 09:33:09 +0900
Subject: genirq/affinity: Consider that CPUs on nodes can be unbalanced

If CPUs on a node are offline at boot time, the number of nodes is
different when building affinity masks for present cpus and when building
affinity masks for possible cpus. This causes the following problem:

In the case that the number of vectors is less than the number of nodes
there are cases where bits of masks for present cpus are overwritten when
building masks for possible cpus.

Fix this by excluding CPUs, which are not part of the current build mask
(present/possible).

[ tglx: Massaged changelog and added comment ]

Fixes: b82592199032 ("genirq/affinity: Spread IRQs to all available NUMA nodes")
Signed-off-by: Rei Yamamoto <yamamoto.rei@jp.fujitsu.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220331003309.10891-1-yamamoto.rei@jp.fujitsu.com
---
 kernel/irq/affinity.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f7ff8919dc9b..fdf170404650 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
-				   node_to_cpumask[n]);
+			/* Ensure that only CPUs which are in both masks are set */
+			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
-- 
cgit v1.2.3


From aa1b02e674fe69acd04624f5bcdef94928bc8695 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sun, 10 Apr 2022 14:00:19 +0800
Subject: bpf: Remove redundant assignment to meta.seq in __task_seq_show()

The seq argument is assigned to meta.seq twice, the second one is
redundant, remove it.

This patch also removes a redundant space in bpf_iter_link_attach().

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20220410060020.307283-1-ytcoode@gmail.com
---
 kernel/bpf/bpf_iter.c  | 2 +-
 kernel/bpf/task_iter.c | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index dea920b3b840..d5d96ceca105 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -545,7 +545,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
 	link->tinfo = tinfo;
 
-	err  = bpf_link_prime(&link->link, &link_primer);
+	err = bpf_link_prime(&link->link, &link_primer);
 	if (err) {
 		kfree(link);
 		return err;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index d94696198ef8..8c921799def4 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -99,7 +99,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
 	if (!prog)
 		return 0;
 
-	meta.seq = seq;
 	ctx.meta = &meta;
 	ctx.task = task;
 	return bpf_iter_run_prog(prog, &ctx);
-- 
cgit v1.2.3


From 8ed00760203d8018bee042fbfe8e076579be2c2b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 12 Jan 2022 09:52:44 -0800
Subject: srcu: Tighten cleanup_srcu_struct() GP checks

Currently, cleanup_srcu_struct() checks for a grace period in progress,
but it does not check for a grace period that has not yet started but
which might start at any time.  Such a situation could result in a
use-after-free bug, so this commit adds a check for a grace period that
is needed but not yet started to cleanup_srcu_struct().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6833d8887181..d30e4db04506 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -382,9 +382,11 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 			return; /* Forgot srcu_barrier(), so just leak it! */
 	}
 	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+	    WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
 	    WARN_ON(srcu_readers_active(ssp))) {
-		pr_info("%s: Active srcu_struct %p state: %d\n",
-			__func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
+		pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+			__func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
+			rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
 		return; /* Caller forgot to stop doing call_srcu()? */
 	}
 	free_percpu(ssp->sda);
-- 
cgit v1.2.3


From 7b9e9b5856e188c1b3ff51185f3600ee79b4ab41 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 20 Jan 2022 13:39:33 -0800
Subject: srcu: Make srcu_funnel_gp_start() cache ->mynode in snp_leaf

Currently, the srcu_funnel_gp_start() walks its local variable snp up the
tree and reloads sdp->mynode whenever it is necessary to check whether
it is still at the leaf srcu_node level.  This works, but is a bit more
obtuse than absolutely necessary.  In addition, upcoming commits will
dynamically size srcu_struct structures, in which case sdp->mynode will
no longer necessarily be a constant, and this commit helps prepare for
that dynamic sizing.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d30e4db04506..7d13e35e5d27 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -632,20 +632,21 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 {
 	unsigned long flags;
 	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
-	struct srcu_node *snp = sdp->mynode;
+	struct srcu_node *snp;
+	struct srcu_node *snp_leaf = sdp->mynode;
 	unsigned long snp_seq;
 
 	/* Each pass through the loop does one level of the srcu_node tree. */
-	for (; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
+	for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+		if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
 			return; /* GP already done and CBs recorded. */
 		spin_lock_irqsave_rcu_node(snp, flags);
 		if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
 			snp_seq = snp->srcu_have_cbs[idx];
-			if (snp == sdp->mynode && snp_seq == s)
+			if (snp == snp_leaf && snp_seq == s)
 				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
 			spin_unlock_irqrestore_rcu_node(snp, flags);
-			if (snp == sdp->mynode && snp_seq != s) {
+			if (snp == snp_leaf && snp_seq != s) {
 				srcu_schedule_cbs_sdp(sdp, do_norm
 							   ? SRCU_INTERVAL
 							   : 0);
@@ -656,7 +657,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 			return;
 		}
 		snp->srcu_have_cbs[idx] = s;
-		if (snp == sdp->mynode)
+		if (snp == snp_leaf)
 			snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
 		if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
 			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
-- 
cgit v1.2.3


From 994f706872e6ce080506bd795ecf783d5b617de6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 24 Jan 2022 09:46:57 -0800
Subject: srcu: Make Tree SRCU able to operate without snp_node array

This commit makes Tree SRCU able to operate without an snp_node
array, that is, when the srcu_data structures' ->mynode pointers
are NULL.  This can result in high contention on the srcu_struct
structure's ->lock, but only when there are lots of call_srcu(),
synchronize_srcu(), and synchronize_srcu_expedited() calls.

Note that when there is no snp_node array, all SRCU callbacks use
CPU 0's callback queue.  This is optimal in the common case of low
update-side load because it removes the need to search each CPU
for the single callback that made the grace period happen.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 203 +++++++++++++++++++++++++++-----------------------
 1 file changed, 111 insertions(+), 92 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 7d13e35e5d27..e23696edd43b 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -152,16 +152,17 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 		sdp->ssp = ssp;
 		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
 	}
+	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
 }
 
 /*
  * Initialize non-compile-time initialized fields, including the
- * associated srcu_node and srcu_data structures.  The is_static
- * parameter is passed through to init_srcu_struct_nodes(), and
- * also tells us that ->sda has already been wired up to srcu_data.
+ * associated srcu_node and srcu_data structures.  The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
  */
 static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 {
+	ssp->srcu_size_state = SRCU_SIZE_SMALL;
 	mutex_init(&ssp->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
@@ -175,6 +176,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	if (!ssp->sda)
 		return -ENOMEM;
 	init_srcu_struct_nodes(ssp);
+	ssp->srcu_size_state = SRCU_SIZE_BIG;
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
@@ -391,6 +393,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 	}
 	free_percpu(ssp->sda);
 	ssp->sda = NULL;
+	ssp->srcu_size_state = SRCU_SIZE_SMALL;
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
@@ -439,6 +442,10 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 	struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
 	int state;
 
+	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+		sdp = per_cpu_ptr(ssp->sda, 0);
+	else
+		sdp = this_cpu_ptr(ssp->sda);
 	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
 	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
 	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
@@ -539,38 +546,40 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	/* A new grace period can start at this point.  But only one. */
 
 	/* Initiate callback invocation as needed. */
-	idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
-	srcu_for_each_node_breadth_first(ssp, snp) {
-		spin_lock_irq_rcu_node(snp);
-		cbs = false;
-		last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
-		if (last_lvl)
-			cbs = snp->srcu_have_cbs[idx] == gpseq;
-		snp->srcu_have_cbs[idx] = gpseq;
-		rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
-		if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
-			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
-		mask = snp->srcu_data_have_cbs[idx];
-		snp->srcu_data_have_cbs[idx] = 0;
-		spin_unlock_irq_rcu_node(snp);
-		if (cbs)
-			srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
-
-		/* Occasionally prevent srcu_data counter wrap. */
-		if (!(gpseq & counter_wrap_check) && last_lvl)
-			for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
-				sdp = per_cpu_ptr(ssp->sda, cpu);
-				spin_lock_irqsave_rcu_node(sdp, flags);
-				if (ULONG_CMP_GE(gpseq,
-						 sdp->srcu_gp_seq_needed + 100))
-					sdp->srcu_gp_seq_needed = gpseq;
-				if (ULONG_CMP_GE(gpseq,
-						 sdp->srcu_gp_seq_needed_exp + 100))
-					sdp->srcu_gp_seq_needed_exp = gpseq;
-				spin_unlock_irqrestore_rcu_node(sdp, flags);
-			}
+	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) {
+		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
+	} else {
+		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+		srcu_for_each_node_breadth_first(ssp, snp) {
+			spin_lock_irq_rcu_node(snp);
+			cbs = false;
+			last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+			if (last_lvl)
+				cbs = snp->srcu_have_cbs[idx] == gpseq;
+			snp->srcu_have_cbs[idx] = gpseq;
+			rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+			if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
+				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+			mask = snp->srcu_data_have_cbs[idx];
+			snp->srcu_data_have_cbs[idx] = 0;
+			spin_unlock_irq_rcu_node(snp);
+			if (cbs)
+				srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+		}
 	}
 
+	/* Occasionally prevent srcu_data counter wrap. */
+	if (!(gpseq & counter_wrap_check))
+		for_each_possible_cpu(cpu) {
+			sdp = per_cpu_ptr(ssp->sda, cpu);
+			spin_lock_irqsave_rcu_node(sdp, flags);
+			if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+				sdp->srcu_gp_seq_needed = gpseq;
+			if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+				sdp->srcu_gp_seq_needed_exp = gpseq;
+			spin_unlock_irqrestore_rcu_node(sdp, flags);
+		}
+
 	/* Callback initiation done, allow grace periods after next. */
 	mutex_unlock(&ssp->srcu_cb_mutex);
 
@@ -599,18 +608,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 {
 	unsigned long flags;
 
-	for (; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
-		    ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
-			return;
-		spin_lock_irqsave_rcu_node(snp, flags);
-		if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+	if (snp)
+		for (; snp != NULL; snp = snp->srcu_parent) {
+			if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
+			    ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
+				return;
+			spin_lock_irqsave_rcu_node(snp, flags);
+			if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+				spin_unlock_irqrestore_rcu_node(snp, flags);
+				return;
+			}
+			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
 			spin_unlock_irqrestore_rcu_node(snp, flags);
-			return;
 		}
-		WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
-		spin_unlock_irqrestore_rcu_node(snp, flags);
-	}
 	spin_lock_irqsave_rcu_node(ssp, flags);
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
 		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
@@ -633,36 +643,37 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 	unsigned long flags;
 	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
 	struct srcu_node *snp;
-	struct srcu_node *snp_leaf = sdp->mynode;
+	struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode);
 	unsigned long snp_seq;
 
-	/* Each pass through the loop does one level of the srcu_node tree. */
-	for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
-			return; /* GP already done and CBs recorded. */
-		spin_lock_irqsave_rcu_node(snp, flags);
-		if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
-			snp_seq = snp->srcu_have_cbs[idx];
-			if (snp == snp_leaf && snp_seq == s)
-				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
-			spin_unlock_irqrestore_rcu_node(snp, flags);
-			if (snp == snp_leaf && snp_seq != s) {
-				srcu_schedule_cbs_sdp(sdp, do_norm
-							   ? SRCU_INTERVAL
-							   : 0);
+	if (snp_leaf)
+		/* Each pass through the loop does one level of the srcu_node tree. */
+		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+			if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
+				return; /* GP already done and CBs recorded. */
+			spin_lock_irqsave_rcu_node(snp, flags);
+			if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+				snp_seq = snp->srcu_have_cbs[idx];
+				if (snp == snp_leaf && snp_seq == s)
+					snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+				spin_unlock_irqrestore_rcu_node(snp, flags);
+				if (snp == snp_leaf && snp_seq != s) {
+					srcu_schedule_cbs_sdp(sdp, do_norm
+								   ? SRCU_INTERVAL
+								   : 0);
+					return;
+				}
+				if (!do_norm)
+					srcu_funnel_exp_start(ssp, snp, s);
 				return;
 			}
-			if (!do_norm)
-				srcu_funnel_exp_start(ssp, snp, s);
-			return;
+			snp->srcu_have_cbs[idx] = s;
+			if (snp == snp_leaf)
+				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+			if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
+				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+			spin_unlock_irqrestore_rcu_node(snp, flags);
 		}
-		snp->srcu_have_cbs[idx] = s;
-		if (snp == snp_leaf)
-			snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
-		if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
-			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
-		spin_unlock_irqrestore_rcu_node(snp, flags);
-	}
 
 	/* Top of tree, must ensure the grace period will be started. */
 	spin_lock_irqsave_rcu_node(ssp, flags);
@@ -820,7 +831,10 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 
 	check_init_srcu_struct(ssp);
 	idx = srcu_read_lock(ssp);
-	sdp = raw_cpu_ptr(ssp->sda);
+	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_CALL)
+		sdp = per_cpu_ptr(ssp->sda, 0);
+	else
+		sdp = raw_cpu_ptr(ssp->sda);
 	spin_lock_irqsave_rcu_node(sdp, flags);
 	if (rhp)
 		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
@@ -840,7 +854,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	if (needgp)
 		srcu_funnel_gp_start(ssp, sdp, s, do_norm);
 	else if (needexp)
-		srcu_funnel_exp_start(ssp, sdp->mynode, s);
+		srcu_funnel_exp_start(ssp, smp_load_acquire(&sdp->mynode), s);
 	srcu_read_unlock(ssp, idx);
 	return s;
 }
@@ -1100,6 +1114,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
 		complete(&ssp->srcu_barrier_completion);
 }
 
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist.  but only if that ->cblist already has at least one
+ * callback enqueued.  Note that if a CPU already has callbacks enqueue,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+	spin_lock_irq_rcu_node(sdp);
+	atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+	sdp->srcu_barrier_head.func = srcu_barrier_cb;
+	debug_rcu_head_queue(&sdp->srcu_barrier_head);
+	if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+				   &sdp->srcu_barrier_head)) {
+		debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+		atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+	}
+	spin_unlock_irq_rcu_node(sdp);
+}
+
 /**
  * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
  * @ssp: srcu_struct on which to wait for in-flight callbacks.
@@ -1107,7 +1143,6 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
 void srcu_barrier(struct srcu_struct *ssp)
 {
 	int cpu;
-	struct srcu_data *sdp;
 	unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
 
 	check_init_srcu_struct(ssp);
@@ -1123,27 +1158,11 @@ void srcu_barrier(struct srcu_struct *ssp)
 	/* Initial count prevents reaching zero until all CBs are posted. */
 	atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
 
-	/*
-	 * Each pass through this loop enqueues a callback, but only
-	 * on CPUs already having callbacks enqueued.  Note that if
-	 * a CPU already has callbacks enqueue, it must have already
-	 * registered the need for a future grace period, so all we
-	 * need do is enqueue a callback that will use the same
-	 * grace period as the last callback already in the queue.
-	 */
-	for_each_possible_cpu(cpu) {
-		sdp = per_cpu_ptr(ssp->sda, cpu);
-		spin_lock_irq_rcu_node(sdp);
-		atomic_inc(&ssp->srcu_barrier_cpu_cnt);
-		sdp->srcu_barrier_head.func = srcu_barrier_cb;
-		debug_rcu_head_queue(&sdp->srcu_barrier_head);
-		if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
-					   &sdp->srcu_barrier_head)) {
-			debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
-			atomic_dec(&ssp->srcu_barrier_cpu_cnt);
-		}
-		spin_unlock_irq_rcu_node(sdp);
-	}
+	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
+	else
+		for_each_possible_cpu(cpu)
+			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
 
 	/* Remove the initial count, at which point reaching zero can happen. */
 	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
-- 
cgit v1.2.3


From 2ec303113d978931ef368886c4c6bc854493e8bf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 21 Jan 2022 16:13:52 -0800
Subject: srcu: Dynamically allocate srcu_node array

This commit shrinks the srcu_struct structure by converting its ->node
field from a fixed-size compile-time array to a pointer to a dynamically
allocated array.  In kernels built with large values of NR_CPUS that boot
on systems with smaller numbers of CPUs, this can save significant memory.

[ paulmck: Apply kernel test robot feedback. ]

Reported-by: A cast of thousands
Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 65 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e23696edd43b..e98cc218e42b 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,6 +24,7 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/srcu.h>
 
 #include "rcu.h"
@@ -75,12 +76,42 @@ do {									\
 	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)	\
 
 /*
- * Initialize SRCU combining tree.  Note that statically allocated
+ * Initialize SRCU per-CPU data.  Note that statically allocated
  * srcu_struct structures might already have srcu_read_lock() and
  * srcu_read_unlock() running against them.  So if the is_static parameter
  * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
  */
-static void init_srcu_struct_nodes(struct srcu_struct *ssp)
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+	int cpu;
+	struct srcu_data *sdp;
+
+	/*
+	 * Initialize the per-CPU srcu_data array, which feeds into the
+	 * leaves of the srcu_node tree.
+	 */
+	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+		     ARRAY_SIZE(sdp->srcu_unlock_count));
+	for_each_possible_cpu(cpu) {
+		sdp = per_cpu_ptr(ssp->sda, cpu);
+		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+		rcu_segcblist_init(&sdp->srcu_cblist);
+		sdp->srcu_cblist_invoking = false;
+		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
+		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+		sdp->mynode = NULL;
+		sdp->cpu = cpu;
+		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+		sdp->ssp = ssp;
+	}
+}
+
+/*
+ * Allocated and initialize SRCU combining tree.  Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp)
 {
 	int cpu;
 	int i;
@@ -92,6 +123,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 
 	/* Initialize geometry if it has not already been initialized. */
 	rcu_init_geometry();
+	ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC);
+	if (!ssp->node)
+		return false;
 
 	/* Work out the overall tree geometry. */
 	ssp->level[0] = &ssp->node[0];
@@ -129,30 +163,20 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 	 * Initialize the per-CPU srcu_data array, which feeds into the
 	 * leaves of the srcu_node tree.
 	 */
-	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
-		     ARRAY_SIZE(sdp->srcu_unlock_count));
 	level = rcu_num_lvls - 1;
 	snp_first = ssp->level[level];
 	for_each_possible_cpu(cpu) {
 		sdp = per_cpu_ptr(ssp->sda, cpu);
-		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
-		rcu_segcblist_init(&sdp->srcu_cblist);
-		sdp->srcu_cblist_invoking = false;
-		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
-		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
 		sdp->mynode = &snp_first[cpu / levelspread[level]];
 		for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
 			if (snp->grplo < 0)
 				snp->grplo = cpu;
 			snp->grphi = cpu;
 		}
-		sdp->cpu = cpu;
-		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
-		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
-		sdp->ssp = ssp;
 		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
 	}
 	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+	return true;
 }
 
 /*
@@ -163,6 +187,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 {
 	ssp->srcu_size_state = SRCU_SIZE_SMALL;
+	ssp->node = NULL;
 	mutex_init(&ssp->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
@@ -175,8 +200,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
 	if (!ssp->sda)
 		return -ENOMEM;
-	init_srcu_struct_nodes(ssp);
-	ssp->srcu_size_state = SRCU_SIZE_BIG;
+	init_srcu_struct_data(ssp);
+	if (!init_srcu_struct_nodes(ssp)) {
+		if (!is_static) {
+			free_percpu(ssp->sda);
+			ssp->sda = NULL;
+			return -ENOMEM;
+		}
+	} else {
+		ssp->srcu_size_state = SRCU_SIZE_BIG;
+	}
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
@@ -393,6 +426,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 	}
 	free_percpu(ssp->sda);
 	ssp->sda = NULL;
+	kfree(ssp->node);
+	ssp->node = NULL;
 	ssp->srcu_size_state = SRCU_SIZE_SMALL;
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-- 
cgit v1.2.3


From e2f638365dd6283b7df1cb5e82f5b2746359f062 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 24 Jan 2022 15:41:32 -0800
Subject: srcu: Add size-state transitioning code

This is just dead code at the moment, and will be used once
the state-transition code is activated.

Because srcu_barrier() must be aware of transition before call_srcu(), the
state machine waits for an SRCU grace period before callbacks are queued
to the non-CPU-0 queues.  This requres that portions of srcu_barrier()
be enclosed in an SRCU read-side critical section.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e98cc218e42b..00f9aed6f7b9 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -562,6 +562,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	unsigned long mask;
 	struct srcu_data *sdp;
 	struct srcu_node *snp;
+	int ss_state;
 
 	/* Prevent more than one additional grace period. */
 	mutex_lock(&ssp->srcu_cb_mutex);
@@ -629,6 +630,15 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	} else {
 		spin_unlock_irq_rcu_node(ssp);
 	}
+
+	/* Transition to big if needed. */
+	ss_state = smp_load_acquire(&ssp->srcu_size_state);
+	if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+		if (ss_state == SRCU_SIZE_ALLOC)
+			init_srcu_struct_nodes(ssp);
+		else
+			smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+	}
 }
 
 /*
@@ -1178,6 +1188,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
 void srcu_barrier(struct srcu_struct *ssp)
 {
 	int cpu;
+	int idx;
 	unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
 
 	check_init_srcu_struct(ssp);
@@ -1193,11 +1204,13 @@ void srcu_barrier(struct srcu_struct *ssp)
 	/* Initial count prevents reaching zero until all CBs are posted. */
 	atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
 
+	idx = srcu_read_lock(ssp);
 	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
 		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
 	else
 		for_each_possible_cpu(cpu)
 			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+	srcu_read_unlock(ssp, idx);
 
 	/* Remove the initial count, at which point reaching zero can happen. */
 	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
-- 
cgit v1.2.3


From 3bedebcf63c2ad7396f1be138bbef91a402f33cd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 24 Jan 2022 17:05:51 -0800
Subject: srcu: Make rcutorture dump the SRCU size state

This commit adds the numeric and string version of ->srcu_size_state to
the Tree-SRCU-specific portion of the rcutorture output.

[ paulmck: Apply feedback from kernel test robot and Dan Carpenter. ]
[ quic_neeraju: Apply feedback from Jiapeng Chong. ]

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 00f9aed6f7b9..0874e3be6a5a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1407,15 +1407,33 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
 }
 EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
 
+static const char * const srcu_size_state_name[] = {
+	"SRCU_SIZE_SMALL",
+	"SRCU_SIZE_ALLOC",
+	"SRCU_SIZE_WAIT_BARRIER",
+	"SRCU_SIZE_WAIT_CALL",
+	"SRCU_SIZE_WAIT_CBS1",
+	"SRCU_SIZE_WAIT_CBS2",
+	"SRCU_SIZE_WAIT_CBS3",
+	"SRCU_SIZE_WAIT_CBS4",
+	"SRCU_SIZE_BIG",
+	"SRCU_SIZE_???",
+};
+
 void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 {
 	int cpu;
 	int idx;
 	unsigned long s0 = 0, s1 = 0;
+	int ss_state = READ_ONCE(ssp->srcu_size_state);
+	int ss_state_idx = ss_state;
 
 	idx = ssp->srcu_idx & 0x1;
-	pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
-		 tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
+	if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+		ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+	pr_alert("%s%s Tree SRCU g%ld state %d (%s) per-CPU(idx=%d):",
+		 tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+		 srcu_size_state_name[ss_state_idx], idx);
 	for_each_possible_cpu(cpu) {
 		unsigned long l0, l1;
 		unsigned long u0, u1;
-- 
cgit v1.2.3


From aeb9b39b8f4aac1233302c53a1fd99a73fd2c262 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 26 Jan 2022 16:01:26 -0800
Subject: srcu: Compute snp_seq earlier in srcu_funnel_gp_start()

Currently, srcu_funnel_gp_start() tests snp->srcu_have_cbs[idx] and then
separately assigns it to the snp_seq local variable.  This commit does
the assignment earlier to simplify the code a bit.  While in the area,
this commit also takes advantage of the 100-character line limit to put
the call to srcu_schedule_cbs_sdp() on a single line.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0874e3be6a5a..fec608f69962 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -697,15 +697,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 			if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
 				return; /* GP already done and CBs recorded. */
 			spin_lock_irqsave_rcu_node(snp, flags);
-			if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
-				snp_seq = snp->srcu_have_cbs[idx];
+			snp_seq = snp->srcu_have_cbs[idx];
+			if (ULONG_CMP_GE(snp_seq, s)) {
 				if (snp == snp_leaf && snp_seq == s)
 					snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
 				spin_unlock_irqrestore_rcu_node(snp, flags);
 				if (snp == snp_leaf && snp_seq != s) {
-					srcu_schedule_cbs_sdp(sdp, do_norm
-								   ? SRCU_INTERVAL
-								   : 0);
+					srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
 					return;
 				}
 				if (!do_norm)
-- 
cgit v1.2.3


From cbdc98e93efa7bbf6f2fcd68c73df82c37b5fa65 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 26 Jan 2022 17:03:06 -0800
Subject: srcu: Use invalid initial value for srcu_node GP sequence numbers

Currently, tree SRCU relies on the srcu_node structures being initialized
at the same time that the srcu_struct itself is initialized, and thus
use the initial grace-period sequence number as the initial value for
the srcu_node structure's ->srcu_have_cbs[] and ->srcu_gp_seq_needed_exp
fields.  Although this has a high probability of also working when the
srcu_node array is allocated and initialized at some random later time,
it would be better to avoid leaving such things to chance.

This commit therefore initializes these fields with 0x2, which is a
recognizable invalid value.  It then adds the required checks for this
invalid value in order to avoid confusion on long-running kernels
(especially those on 32-bit systems) that allocate and initialize
srcu_node arrays late in life.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fec608f69962..155c430c6a73 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -107,6 +107,18 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
 	}
 }
 
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ		0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+	return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ;
+}
+
 /*
  * Allocated and initialize SRCU combining tree.  Returns @true if
  * allocation succeeded and @false otherwise.
@@ -139,10 +151,10 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp)
 		WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
 			     ARRAY_SIZE(snp->srcu_data_have_cbs));
 		for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
-			snp->srcu_have_cbs[i] = 0;
+			snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
 			snp->srcu_data_have_cbs[i] = 0;
 		}
-		snp->srcu_gp_seq_needed_exp = 0;
+		snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
 		snp->grplo = -1;
 		snp->grphi = -1;
 		if (snp == &ssp->node[0]) {
@@ -386,8 +398,7 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
  */
 static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 {
-	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
-			 READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
 		return 0;
 	return SRCU_INTERVAL;
 }
@@ -561,6 +572,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	int idx;
 	unsigned long mask;
 	struct srcu_data *sdp;
+	unsigned long sgsne;
 	struct srcu_node *snp;
 	int ss_state;
 
@@ -594,7 +606,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 				cbs = snp->srcu_have_cbs[idx] == gpseq;
 			snp->srcu_have_cbs[idx] = gpseq;
 			rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
-			if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
+			sgsne = snp->srcu_gp_seq_needed_exp;
+			if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
 				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
 			mask = snp->srcu_data_have_cbs[idx];
 			snp->srcu_data_have_cbs[idx] = 0;
@@ -652,14 +665,17 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 				  unsigned long s)
 {
 	unsigned long flags;
+	unsigned long sgsne;
 
 	if (snp)
 		for (; snp != NULL; snp = snp->srcu_parent) {
+			sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
 			if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
-			    ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
+			    (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
 				return;
 			spin_lock_irqsave_rcu_node(snp, flags);
-			if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+			sgsne = snp->srcu_gp_seq_needed_exp;
+			if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
 				spin_unlock_irqrestore_rcu_node(snp, flags);
 				return;
 			}
@@ -687,6 +703,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 {
 	unsigned long flags;
 	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
+	unsigned long sgsne;
 	struct srcu_node *snp;
 	struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode);
 	unsigned long snp_seq;
@@ -698,7 +715,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 				return; /* GP already done and CBs recorded. */
 			spin_lock_irqsave_rcu_node(snp, flags);
 			snp_seq = snp->srcu_have_cbs[idx];
-			if (ULONG_CMP_GE(snp_seq, s)) {
+			if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
 				if (snp == snp_leaf && snp_seq == s)
 					snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
 				spin_unlock_irqrestore_rcu_node(snp, flags);
@@ -713,7 +730,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 			snp->srcu_have_cbs[idx] = s;
 			if (snp == snp_leaf)
 				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
-			if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
+			sgsne = snp->srcu_gp_seq_needed_exp;
+			if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
 				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
 			spin_unlock_irqrestore_rcu_node(snp, flags);
 		}
-- 
cgit v1.2.3


From 0b56f953908a751716f2c8f907942674b60d8db5 Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Date: Tue, 22 Feb 2022 11:39:01 +0530
Subject: srcu: Ensure snp nodes tree is fully initialized before traversal

For configurations where snp node tree is not initialized at
init time (added in subsequent commits), srcu_funnel_gp_start()
and srcu_funnel_exp_start() can potential traverse and observe
the snp nodes' transient (uninitialized) states. This can potentially
happen, when init_srcu_struct_nodes() initialization of sdp->mynode
races with srcu_funnel_gp_start() and srcu_funnel_exp_start()

Consider the case below where srcu_funnel_gp_start() observes
sdp->mynode to be not NULL and uses an uninitialized sdp->grpmask

          P1                                  P2

init_srcu_struct_nodes()           void srcu_funnel_gp_start(...)
{
for_each_possible_cpu(cpu) {
  ...
  sdp->mynode = &snp_first[...];
  for (snp = sdp->mynode;...)       struct srcu_node *snp_leaf =
                                       smp_load_acquire(&sdp->mynode)
    ...                             if (snp_leaf) {
                                      for (snp = snp_leaf; ...)
                                        ...
					if (snp == snp_leaf)
                                         snp->srcu_data_have_cbs[idx] |=
                                           sdp->grpmask;
    sdp->grpmask =
      1 << (cpu - sdp->mynode->grplo);
  }
}

Similarly, init_srcu_struct_nodes() and srcu_funnel_exp_start() can
race, where srcu_funnel_exp_start() could observe state of snp lock
before spin_lock_init().

          P1                                      P2

init_srcu_struct_nodes()               void srcu_funnel_exp_start(...)
{
  srcu_for_each_node_breadth_first(ssp, snp) {      for (; ...) {
                                                      spin_lock_...(snp, )
	spin_lock_init(&ACCESS_PRIVATE(snp, lock));
    ...
  }
  for_each_possible_cpu(cpu) {
    ...
    sdp->mynode = &snp_first[...];

To avoid these issues, ensure that snp node tree initialization is
complete i.e. after SRCU_SIZE_WAIT_BARRIER srcu_size_state is reached,
before traversing the tree. Given that srcu_funnel_gp_start() and
srcu_funnel_exp_start() are called within SRCU read side critical
sections, this check is safe, in the sense that all callbacks are
enqueued on CPU0 srcu_cblist until SRCU_SIZE_WAIT_CALL is entered,
and these read side critical sections (containing srcu_funnel_gp_start()
and srcu_funnel_exp_start()) need to complete, before SRCU_SIZE_WAIT_CALL
is reached.

Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 155c430c6a73..2e7ed67646db 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -705,9 +705,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
 	unsigned long sgsne;
 	struct srcu_node *snp;
-	struct srcu_node *snp_leaf = smp_load_acquire(&sdp->mynode);
+	struct srcu_node *snp_leaf;
 	unsigned long snp_seq;
 
+	/* Ensure that snp node tree is fully initialized before traversing it */
+	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+		snp_leaf = NULL;
+	else
+		snp_leaf = sdp->mynode;
+
 	if (snp_leaf)
 		/* Each pass through the loop does one level of the srcu_node tree. */
 		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
@@ -889,10 +895,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	bool needgp = false;
 	unsigned long s;
 	struct srcu_data *sdp;
+	struct srcu_node *sdp_mynode;
+	int ss_state;
 
 	check_init_srcu_struct(ssp);
 	idx = srcu_read_lock(ssp);
-	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_CALL)
+	ss_state = smp_load_acquire(&ssp->srcu_size_state);
+	if (ss_state < SRCU_SIZE_WAIT_CALL)
 		sdp = per_cpu_ptr(ssp->sda, 0);
 	else
 		sdp = raw_cpu_ptr(ssp->sda);
@@ -912,10 +921,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 		needexp = true;
 	}
 	spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+	/* Ensure that snp node tree is fully initialized before traversing it */
+	if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+		sdp_mynode = NULL;
+	else
+		sdp_mynode = sdp->mynode;
+
 	if (needgp)
 		srcu_funnel_gp_start(ssp, sdp, s, do_norm);
 	else if (needexp)
-		srcu_funnel_exp_start(ssp, smp_load_acquire(&sdp->mynode), s);
+		srcu_funnel_exp_start(ssp, sdp_mynode, s);
 	srcu_read_unlock(ssp, idx);
 	return s;
 }
-- 
cgit v1.2.3


From c69a00a12e26cf4faffdcdb340cb2d059b61d57e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 25 Jan 2022 15:41:10 -0800
Subject: srcu: Add boot-time control over srcu_node array allocation

This commit adds an srcu_tree.convert_to_big kernel parameter that either
refuses to convert at all (0), converts immediately at init_srcu_struct()
time (1), or lets rcutorture convert it (2).  An addition contention-based
dynamic conversion choice will be added, along with documentation.

[ paulmck: Apply callback-scanning feedback from Neeraj Upadhyay. ]

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 48 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 2e7ed67646db..9dd048989027 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -39,6 +39,15 @@ module_param(exp_holdoff, ulong, 0444);
 static ulong counter_wrap_check = (ULONG_MAX >> 2);
 module_param(counter_wrap_check, ulong, 0444);
 
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all (default).
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ */
+static int convert_to_big;
+module_param(convert_to_big, int, 0444);
+
 /* Early-boot callback-management, so early that no lock is required! */
 static LIST_HEAD(srcu_boot_list);
 static bool __read_mostly srcu_init_done;
@@ -123,7 +132,7 @@ static inline bool srcu_invl_snp_seq(unsigned long s)
  * Allocated and initialize SRCU combining tree.  Returns @true if
  * allocation succeeded and @false otherwise.
  */
-static bool init_srcu_struct_nodes(struct srcu_struct *ssp)
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 {
 	int cpu;
 	int i;
@@ -135,7 +144,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp)
 
 	/* Initialize geometry if it has not already been initialized. */
 	rcu_init_geometry();
-	ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC);
+	ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
 	if (!ssp->node)
 		return false;
 
@@ -213,17 +222,19 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	if (!ssp->sda)
 		return -ENOMEM;
 	init_srcu_struct_data(ssp);
-	if (!init_srcu_struct_nodes(ssp)) {
-		if (!is_static) {
-			free_percpu(ssp->sda);
-			ssp->sda = NULL;
-			return -ENOMEM;
-		}
-	} else {
-		ssp->srcu_size_state = SRCU_SIZE_BIG;
-	}
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
+	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) {
+		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
+			if (!is_static) {
+				free_percpu(ssp->sda);
+				ssp->sda = NULL;
+				return -ENOMEM;
+			}
+		} else {
+			WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+		}
+	}
 	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
 	return 0;
 }
@@ -594,7 +605,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	/* A new grace period can start at this point.  But only one. */
 
 	/* Initiate callback invocation as needed. */
-	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) {
+	ss_state = smp_load_acquire(&ssp->srcu_size_state);
+	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
 		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
 	} else {
 		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
@@ -603,13 +615,16 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 			cbs = false;
 			last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
 			if (last_lvl)
-				cbs = snp->srcu_have_cbs[idx] == gpseq;
+				cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
 			snp->srcu_have_cbs[idx] = gpseq;
 			rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
 			sgsne = snp->srcu_gp_seq_needed_exp;
 			if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
 				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
-			mask = snp->srcu_data_have_cbs[idx];
+			if (ss_state < SRCU_SIZE_BIG)
+				mask = ~0;
+			else
+				mask = snp->srcu_data_have_cbs[idx];
 			snp->srcu_data_have_cbs[idx] = 0;
 			spin_unlock_irq_rcu_node(snp);
 			if (cbs)
@@ -645,10 +660,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	}
 
 	/* Transition to big if needed. */
-	ss_state = smp_load_acquire(&ssp->srcu_size_state);
 	if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
 		if (ss_state == SRCU_SIZE_ALLOC)
-			init_srcu_struct_nodes(ssp);
+			init_srcu_struct_nodes(ssp, GFP_KERNEL);
 		else
 			smp_store_release(&ssp->srcu_size_state, ss_state + 1);
 	}
@@ -1494,6 +1508,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 		s1 += c1;
 	}
 	pr_cont(" T(%ld,%ld)\n", s0, s1);
+	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2)
+		WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC);
 }
 EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
 
-- 
cgit v1.2.3


From 4a230f8046454df18139ed1232f1a1e8a6dd36c5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jan 2022 11:43:11 -0800
Subject: srcu: Avoid NULL dereference in srcu_torture_stats_print()

You really shouldn't invoke srcu_torture_stats_print() after invoking
cleanup_srcu_struct(), but there is really no reason to get a
compiler-obfuscated per-CPU-variable NULL pointer dereference as the
diagnostic.  This commit therefore checks for NULL ->sda and makes a
more polite console-message complaint in that case.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 62 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9dd048989027..b7138dbe1a2d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1477,37 +1477,43 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 	idx = ssp->srcu_idx & 0x1;
 	if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
 		ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
-	pr_alert("%s%s Tree SRCU g%ld state %d (%s) per-CPU(idx=%d):",
+	pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
 		 tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
-		 srcu_size_state_name[ss_state_idx], idx);
-	for_each_possible_cpu(cpu) {
-		unsigned long l0, l1;
-		unsigned long u0, u1;
-		long c0, c1;
-		struct srcu_data *sdp;
-
-		sdp = per_cpu_ptr(ssp->sda, cpu);
-		u0 = data_race(sdp->srcu_unlock_count[!idx]);
-		u1 = data_race(sdp->srcu_unlock_count[idx]);
+		 srcu_size_state_name[ss_state_idx]);
+	if (!ssp->sda) {
+		// Called after cleanup_srcu_struct(), perhaps.
+		pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+	} else {
+		pr_cont(" per-CPU(idx=%d):", idx);
+		for_each_possible_cpu(cpu) {
+			unsigned long l0, l1;
+			unsigned long u0, u1;
+			long c0, c1;
+			struct srcu_data *sdp;
 
-		/*
-		 * Make sure that a lock is always counted if the corresponding
-		 * unlock is counted.
-		 */
-		smp_rmb();
-
-		l0 = data_race(sdp->srcu_lock_count[!idx]);
-		l1 = data_race(sdp->srcu_lock_count[idx]);
-
-		c0 = l0 - u0;
-		c1 = l1 - u1;
-		pr_cont(" %d(%ld,%ld %c)",
-			cpu, c0, c1,
-			"C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
-		s0 += c0;
-		s1 += c1;
+			sdp = per_cpu_ptr(ssp->sda, cpu);
+			u0 = data_race(sdp->srcu_unlock_count[!idx]);
+			u1 = data_race(sdp->srcu_unlock_count[idx]);
+
+			/*
+			 * Make sure that a lock is always counted if the corresponding
+			 * unlock is counted.
+			 */
+			smp_rmb();
+
+			l0 = data_race(sdp->srcu_lock_count[!idx]);
+			l1 = data_race(sdp->srcu_lock_count[idx]);
+
+			c0 = l0 - u0;
+			c1 = l1 - u1;
+			pr_cont(" %d(%ld,%ld %c)",
+				cpu, c0, c1,
+				"C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+			s0 += c0;
+			s1 += c1;
+		}
+		pr_cont(" T(%ld,%ld)\n", s0, s1);
 	}
-	pr_cont(" T(%ld,%ld)\n", s0, s1);
 	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2)
 		WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC);
 }
-- 
cgit v1.2.3


From 46470cf85d2b61abd37c6f66c4dacc1bc510d10f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jan 2022 13:20:49 -0800
Subject: srcu: Prevent cleanup_srcu_struct() from freeing non-dynamic ->sda

When an srcu_struct structure is created (but not in a kernel module)
by DEFINE_SRCU() and friends, the per-CPU srcu_data structure is
statically allocated.  In all other cases, that structure is obtained
from alloc_percpu(), in which case cleanup_srcu_struct() must invoke
free_percpu() on the resulting ->sda pointer in the srcu_struct pointer.

Which it does.

Except that it also invokes free_percpu() on the ->sda pointer
referencing the statically allocated per-CPU srcu_data structures.
Which free_percpu() is surprisingly OK with.

This commit nevertheless stops cleanup_srcu_struct() from freeing
statically allocated per-CPU srcu_data structures.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b7138dbe1a2d..7209fd95dde9 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -217,6 +217,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	mutex_init(&ssp->srcu_barrier_mutex);
 	atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
 	INIT_DELAYED_WORK(&ssp->work, process_srcu);
+	ssp->sda_is_static = is_static;
 	if (!is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
 	if (!ssp->sda)
@@ -226,7 +227,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) {
 		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
-			if (!is_static) {
+			if (!ssp->sda_is_static) {
 				free_percpu(ssp->sda);
 				ssp->sda = NULL;
 				return -ENOMEM;
@@ -446,8 +447,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
 			rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
 		return; /* Caller forgot to stop doing call_srcu()? */
 	}
-	free_percpu(ssp->sda);
-	ssp->sda = NULL;
+	if (!ssp->sda_is_static) {
+		free_percpu(ssp->sda);
+		ssp->sda = NULL;
+	}
 	kfree(ssp->node);
 	ssp->node = NULL;
 	ssp->srcu_size_state = SRCU_SIZE_SMALL;
-- 
cgit v1.2.3


From ee5e2448bceb9400aa27207f0c0220f9dedd85eb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jan 2022 13:47:42 -0800
Subject: srcu: Explain srcu_funnel_gp_start() call to list_add() is safe

This commit adds a comment explaining why an unprotected call to
list_add() from srcu_funnel_gp_start() can be safe.  TL;DR: It is only
called during very early boot when we don't have no steeking concurrency!

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 7209fd95dde9..64993a172cff 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -776,6 +776,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 	    rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
 		WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
 		srcu_gp_start(ssp);
+
+		// And how can that list_add() in the "else" clause
+		// possibly be safe for concurrent execution?  Well,
+		// it isn't.  And it does not have to be.  After all, it
+		// can only be executed during early boot when there is only
+		// the one boot CPU running with interrupts still disabled.
 		if (likely(srcu_init_done))
 			queue_delayed_work(rcu_gp_wq, &ssp->work,
 					   srcu_get_delay(ssp));
-- 
cgit v1.2.3


From 99659f64b14e55cfa48980f5396f83820bafd028 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jan 2022 14:56:39 -0800
Subject: srcu: Create concurrency-safe helper for initiating size transition

Once there are contention-initiated size transitions, it will be
possible for rcutorture to initiate a transition at the same time
as a contention-initiated transition.  This commit therefore creates
a concurrency-safe helper function named srcu_transition_to_big() to
safely initiate size transitions.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 64993a172cff..c9460374d437 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -272,6 +272,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+	unsigned long flags;
+
+	/* Double-checked locking on ->srcu_size-state. */
+	if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+		return;
+	spin_lock_irqsave_rcu_node(ssp, flags);
+	if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
+		spin_unlock_irqrestore_rcu_node(ssp, flags);
+		return;
+	}
+	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+	spin_unlock_irqrestore_rcu_node(ssp, flags);
+}
+
 /*
  * First-use initialization of statically allocated srcu_struct
  * structure.  Wiring up the combining tree is more than can be
@@ -1523,8 +1542,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 		}
 		pr_cont(" T(%ld,%ld)\n", s0, s1);
 	}
-	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 2)
-		WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+	if (convert_to_big == 2)
+		srcu_transition_to_big(ssp);
 }
 EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
 
-- 
cgit v1.2.3


From 9f2e91d94c91558e3764fe4e01c5e6281a90f239 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jan 2022 20:32:05 -0800
Subject: srcu: Add contention-triggered addition of srcu_node tree

This commit instruments the acquisitions of the srcu_struct structure's
->lock, enabling the initiation of a transition from SRCU_SIZE_SMALL
to SRCU_SIZE_BIG when sufficient contention is experienced.  The
instrumentation counts the number of trylock failures within the confines
of a single jiffy.  If that number exceeds the value specified by the
srcutree.small_contention_lim kernel boot parameter (which defaults to
100), and if the value specified by the srcutree.convert_to_big kernel
boot parameter has the 0x10 bit set (defaults to 0), then a transition
will be automatically initiated.

By default, there will never be any transitions, so that none of the
srcu_struct structures ever gains an srcu_node array.

The useful values for srcutree.convert_to_big are:

0x00:  Never convert.
0x01:  Always convert at init_srcu_struct() time.
0x02:  Convert when rcutorture prints its first round of statistics.
0x03:  Decide conversion approach at boot given system size.
0x10:  Convert if contention is encountered.
0x12:  Convert if contention is encountered or when rcutorture prints
        its first round of statistics, whichever comes first.

The value 0x11 acts the same as 0x01 because the conversion happens
before there is any chance of contention.

[ paulmck: Apply "static" feedback from kernel test robot. ]

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 107 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c9460374d437..0bc6a0a3edee 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -41,13 +41,29 @@ module_param(counter_wrap_check, ulong, 0444);
 
 /*
  * Control conversion to SRCU_SIZE_BIG:
- * 0: Don't convert at all (default).
- * 1: Convert at init_srcu_struct() time.
- * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ *    0: Don't convert at all (default).
+ *    1: Convert at init_srcu_struct() time.
+ *    2: Convert when rcutorture invokes srcu_torture_stats_print().
+ *    3: Decide at boot time based on system shape.
+ * 0x1x: Convert when excessive contention encountered.
  */
-static int convert_to_big;
+#define SRCU_SIZING_NONE	0
+#define SRCU_SIZING_INIT	1
+#define SRCU_SIZING_TORTURE	2
+#define SRCU_SIZING_AUTO	3
+#define SRCU_SIZING_CONTEND	0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_NONE;
 module_param(convert_to_big, int, 0444);
 
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
 /* Early-boot callback-management, so early that no lock is required! */
 static LIST_HEAD(srcu_boot_list);
 static bool __read_mostly srcu_init_done;
@@ -58,31 +74,40 @@ static void process_srcu(struct work_struct *work);
 static void srcu_delay_timer(struct timer_list *t);
 
 /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p)					\
-do {									\
-	spin_lock(&ACCESS_PRIVATE(p, lock));			\
-	smp_mb__after_unlock_lock();					\
+#define spin_lock_rcu_node(p)							\
+do {										\
+	spin_lock(&ACCESS_PRIVATE(p, lock));					\
+	smp_mb__after_unlock_lock();						\
 } while (0)
 
 #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
 
-#define spin_lock_irq_rcu_node(p)					\
-do {									\
-	spin_lock_irq(&ACCESS_PRIVATE(p, lock));			\
-	smp_mb__after_unlock_lock();					\
+#define spin_lock_irq_rcu_node(p)						\
+do {										\
+	spin_lock_irq(&ACCESS_PRIVATE(p, lock));				\
+	smp_mb__after_unlock_lock();						\
 } while (0)
 
-#define spin_unlock_irq_rcu_node(p)					\
+#define spin_unlock_irq_rcu_node(p)						\
 	spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
 
-#define spin_lock_irqsave_rcu_node(p, flags)			\
-do {									\
-	spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);	\
-	smp_mb__after_unlock_lock();					\
+#define spin_lock_irqsave_rcu_node(p, flags)					\
+do {										\
+	spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);			\
+	smp_mb__after_unlock_lock();						\
 } while (0)
 
-#define spin_unlock_irqrestore_rcu_node(p, flags)			\
-	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)	\
+#define spin_trylock_irqsave_rcu_node(p, flags)					\
+({										\
+	bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags);	\
+										\
+	if (___locked)								\
+		smp_mb__after_unlock_lock();					\
+	___locked;								\
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags)				\
+	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)			\
 
 /*
  * Initialize SRCU per-CPU data.  Note that statically allocated
@@ -225,7 +250,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	init_srcu_struct_data(ssp);
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
-	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && convert_to_big == 1) {
+	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
 		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
 			if (!ssp->sda_is_static) {
 				free_percpu(ssp->sda);
@@ -272,6 +297,15 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
+	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
 /*
  * Initiate an idempotent transition to SRCU_SIZE_BIG.
  */
@@ -287,10 +321,35 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
 		spin_unlock_irqrestore_rcu_node(ssp, flags);
 		return;
 	}
-	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+	__srcu_transition_to_big(ssp);
 	spin_unlock_irqrestore_rcu_node(ssp, flags);
 }
 
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+	unsigned long j;
+
+	if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+		return;
+	spin_lock_irqsave_rcu_node(ssp, *flags);
+	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+		return;
+	j = jiffies;
+	if (ssp->srcu_size_jiffies != j) {
+		ssp->srcu_size_jiffies = j;
+		ssp->srcu_n_lock_retries = 0;
+	}
+	if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+		return;
+	__srcu_transition_to_big(ssp);
+}
+
 /*
  * First-use initialization of statically allocated srcu_struct
  * structure.  Wiring up the combining tree is more than can be
@@ -718,7 +777,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
 			spin_unlock_irqrestore_rcu_node(snp, flags);
 		}
-	spin_lock_irqsave_rcu_node(ssp, flags);
+	spin_lock_irqsave_ssp_contention(ssp, &flags);
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
 		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
 	spin_unlock_irqrestore_rcu_node(ssp, flags);
@@ -779,7 +838,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 		}
 
 	/* Top of tree, must ensure the grace period will be started. */
-	spin_lock_irqsave_rcu_node(ssp, flags);
+	spin_lock_irqsave_ssp_contention(ssp, &flags);
 	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
 		/*
 		 * Record need for grace period s.  Pair with load
@@ -1542,7 +1601,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
 		}
 		pr_cont(" T(%ld,%ld)\n", s0, s1);
 	}
-	if (convert_to_big == 2)
+	if (SRCU_SIZING_IS_TORTURE())
 		srcu_transition_to_big(ssp);
 }
 EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
-- 
cgit v1.2.3


From beb84099f1cf51e005e5df77d05b1644e490409e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 16 Feb 2022 16:42:04 +0100
Subject: rcu: Remove rcu_is_nocb_cpu()

The rcu_is_nocb_cpu() function is no longer used, so this commmit
removes it.

Reported-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h       | 2 --
 kernel/rcu/tree_nocb.h | 8 --------
 2 files changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..4c53a2ccf711 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -538,10 +538,8 @@ extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
-bool rcu_is_nocb_cpu(int cpu);
 void rcu_bind_current_to_nocb(void);
 #else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
 static inline void rcu_bind_current_to_nocb(void) { }
 #endif
 
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 636d0546a4e9..02e1d05a11fc 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -215,14 +215,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
-	if (cpumask_available(rcu_nocb_mask))
-		return cpumask_test_cpu(cpu, rcu_nocb_mask);
-	return false;
-}
-
 static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
 			   struct rcu_data *rdp,
 			   bool force, unsigned long flags)
-- 
cgit v1.2.3


From 8d2aaa9b7c290e766a41f29c71ec72192851d538 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 14 Feb 2022 14:23:39 +0100
Subject: rcu/nocb: Move rcu_nocb_is_setup to rcu_state

This commit moves the RCU nocb initialization witness within rcu_state
to consolidate RCU's global state.

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h      |  1 +
 kernel/rcu/tree_nocb.h | 13 +++++--------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673ebe355..f6a3d54585c9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -364,6 +364,7 @@ struct rcu_state {
 	arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
 						/* Synchronize offline with */
 						/*  GP pre-initialization. */
+	int nocb_is_setup;			/* nocb is setup from boot */
 };
 
 /* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 02e1d05a11fc..3c00240833d6 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
  * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
  * If the list is invalid, a warning is emitted and all CPUs are offloaded.
  */
-
-static bool rcu_nocb_is_setup;
-
 static int __init rcu_nocb_setup(char *str)
 {
 	alloc_bootmem_cpumask_var(&rcu_nocb_mask);
@@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str)
 			cpumask_setall(rcu_nocb_mask);
 		}
 	}
-	rcu_nocb_is_setup = true;
+	rcu_state.nocb_is_setup = true;
 	return 1;
 }
 __setup("rcu_nocbs", rcu_nocb_setup);
@@ -1172,10 +1169,10 @@ void __init rcu_init_nohz(void)
 				return;
 			}
 		}
-		rcu_nocb_is_setup = true;
+		rcu_state.nocb_is_setup = true;
 	}
 
-	if (!rcu_nocb_is_setup)
+	if (!rcu_state.nocb_is_setup)
 		return;
 
 #if defined(CONFIG_NO_HZ_FULL)
@@ -1233,7 +1230,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	struct task_struct *t;
 	struct sched_param sp;
 
-	if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
+	if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
 		return;
 
 	/* If there already is an rcuo kthread, then nothing to do. */
@@ -1279,7 +1276,7 @@ static void __init rcu_spawn_nocb_kthreads(void)
 {
 	int cpu;
 
-	if (rcu_nocb_is_setup) {
+	if (rcu_state.nocb_is_setup) {
 		for_each_online_cpu(cpu)
 			rcu_spawn_cpu_nocb_kthread(cpu);
 	}
-- 
cgit v1.2.3


From 2eed973adc6e749439730e53e6220b122398d319 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 16 Feb 2022 16:42:06 +0100
Subject: rcu: Assume rcu_init() is called before smp

The rcu_init() function is called way before SMP is initialized and
therefore only the boot CPU should be online at this stage.

Simplify the boot per-cpu initialization accordingly.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..e6a9e5744e45 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4782,7 +4782,7 @@ static void __init kfree_rcu_batch_init(void)
 
 void __init rcu_init(void)
 {
-	int cpu;
+	int cpu = smp_processor_id();
 
 	rcu_early_boot_tests();
 
@@ -4802,11 +4802,10 @@ void __init rcu_init(void)
 	 * or the scheduler are operational.
 	 */
 	pm_notifier(rcu_pm_notify, 0);
-	for_each_online_cpu(cpu) {
-		rcutree_prepare_cpu(cpu);
-		rcu_cpu_starting(cpu);
-		rcutree_online_cpu(cpu);
-	}
+	WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+	rcutree_prepare_cpu(cpu);
+	rcu_cpu_starting(cpu);
+	rcutree_online_cpu(cpu);
 
 	/* Create workqueue for Tree SRCU and for expedited GPs. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
-- 
cgit v1.2.3


From 3352911fa9b47a90165e5c6fed440048c55146d1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 16 Feb 2022 16:42:07 +0100
Subject: rcu: Initialize boost kthread only for boot node prior SMP
 initialization

The rcu_spawn_gp_kthread() function is called as an early initcall,
which means that SMP initialization hasn't happened yet and only the
boot CPU is online.  Therefore, create only the boost kthread for the
leaf node of the boot CPU.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        |  5 ++++-
 kernel/rcu/tree.h        |  1 -
 kernel/rcu/tree_plugin.h | 16 ----------------
 3 files changed, 4 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e6a9e5744e45..70b33c55d39a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4480,6 +4480,7 @@ static int __init rcu_spawn_gp_kthread(void)
 	struct rcu_node *rnp;
 	struct sched_param sp;
 	struct task_struct *t;
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
 	rcu_scheduler_fully_active = 1;
 	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
@@ -4498,7 +4499,9 @@ static int __init rcu_spawn_gp_kthread(void)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	wake_up_process(t);
 	rcu_spawn_nocb_kthreads();
-	rcu_spawn_boost_kthreads();
+	/* This is a pre-SMP initcall, we expect a single CPU */
+	WARN_ON(num_online_cpus() > 1);
+	rcu_spawn_one_boost_kthread(rdp->mynode);
 	rcu_spawn_core_kthreads();
 	return 0;
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f6a3d54585c9..b2a0f2613ab9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -422,7 +422,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static bool rcu_is_callbacks_kthread(void);
 static void rcu_cpu_kthread_setup(unsigned int cpu);
 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
-static void __init rcu_spawn_boost_kthreads(void);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
 static void rcu_preempt_deferred_qs(struct task_struct *t);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8360d86db1c0..b139635f33bd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1226,18 +1226,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 	free_cpumask_var(cm);
 }
 
-/*
- * Spawn boost kthreads -- called as soon as the scheduler is running.
- */
-static void __init rcu_spawn_boost_kthreads(void)
-{
-	struct rcu_node *rnp;
-
-	rcu_for_each_leaf_node(rnp)
-		if (rcu_rnp_online_cpus(rnp))
-			rcu_spawn_one_boost_kthread(rnp);
-}
-
 #else /* #ifdef CONFIG_RCU_BOOST */
 
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1263,10 +1251,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
 }
 
-static void __init rcu_spawn_boost_kthreads(void)
-{
-}
-
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 /*
-- 
cgit v1.2.3


From 87c5adf06bfbf14c9d13e59d5d174ff5f2aafc0e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 16 Feb 2022 16:42:08 +0100
Subject: rcu/nocb: Initialize nocb kthreads only for boot CPU prior SMP
 initialization

The rcu_spawn_gp_kthread() function is called as an early initcall, which
means that SMP initialization hasn't happened yet and only the boot CPU is
online. Therefore, create only the NOCB kthreads related to the boot CPU.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c      |  6 +++++-
 kernel/rcu/tree.h      |  1 -
 kernel/rcu/tree_nocb.h | 20 --------------------
 3 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 70b33c55d39a..9f7441a78f90 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4498,9 +4498,13 @@ static int __init rcu_spawn_gp_kthread(void)
 	smp_store_release(&rcu_state.gp_kthread, t);  /* ^^^ */
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	wake_up_process(t);
-	rcu_spawn_nocb_kthreads();
 	/* This is a pre-SMP initcall, we expect a single CPU */
 	WARN_ON(num_online_cpus() > 1);
+	/*
+	 * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+	 * due to rcu_scheduler_fully_active.
+	 */
+	rcu_spawn_cpu_nocb_kthread(smp_processor_id());
 	rcu_spawn_one_boost_kthread(rdp->mynode);
 	rcu_spawn_core_kthreads();
 	return 0;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b2a0f2613ab9..25dc4166f218 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -439,7 +439,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
 static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_cpu_nocb_kthread(int cpu);
-static void __init rcu_spawn_nocb_kthreads(void);
 static void show_rcu_nocb_state(struct rcu_data *rdp);
 static void rcu_nocb_lock(struct rcu_data *rdp);
 static void rcu_nocb_unlock(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3c00240833d6..46694e13398a 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1266,22 +1266,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
 }
 
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs.  This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-	int cpu;
-
-	if (rcu_state.nocb_is_setup) {
-		for_each_online_cpu(cpu)
-			rcu_spawn_cpu_nocb_kthread(cpu);
-	}
-}
-
 /* How many CB CPU IDs per GP kthread?  Default of -1 for sqrt(nr_cpu_ids). */
 static int rcu_nocb_gp_stride = -1;
 module_param(rcu_nocb_gp_stride, int, 0444);
@@ -1538,10 +1522,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
 {
 }
 
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
 static void show_rcu_nocb_state(struct rcu_data *rdp)
 {
 }
-- 
cgit v1.2.3


From f75fd4b9221d93177c50dcfde671b2e907f53e86 Mon Sep 17 00:00:00 2001
From: Padmanabha Srinivasaiah <treasure4paddy@gmail.com>
Date: Thu, 17 Feb 2022 16:25:19 +0100
Subject: rcu-tasks: Fix race in schedule and flush work

While booting secondary CPUs, cpus_read_[lock/unlock] is not keeping
online cpumask stable. The transient online mask results in below
calltrace.

[    0.324121] CPU1: Booted secondary processor 0x0000000001 [0x410fd083]
[    0.346652] Detected PIPT I-cache on CPU2
[    0.347212] CPU2: Booted secondary processor 0x0000000002 [0x410fd083]
[    0.377255] Detected PIPT I-cache on CPU3
[    0.377823] CPU3: Booted secondary processor 0x0000000003 [0x410fd083]
[    0.379040] ------------[ cut here ]------------
[    0.383662] WARNING: CPU: 0 PID: 10 at kernel/workqueue.c:3084 __flush_work+0x12c/0x138
[    0.384850] Modules linked in:
[    0.385403] CPU: 0 PID: 10 Comm: rcu_tasks_rude_ Not tainted 5.17.0-rc3-v8+ #13
[    0.386473] Hardware name: Raspberry Pi 4 Model B Rev 1.4 (DT)
[    0.387289] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    0.388308] pc : __flush_work+0x12c/0x138
[    0.388970] lr : __flush_work+0x80/0x138
[    0.389620] sp : ffffffc00aaf3c60
[    0.390139] x29: ffffffc00aaf3d20 x28: ffffffc009c16af0 x27: ffffff80f761df48
[    0.391316] x26: 0000000000000004 x25: 0000000000000003 x24: 0000000000000100
[    0.392493] x23: ffffffffffffffff x22: ffffffc009c16b10 x21: ffffffc009c16b28
[    0.393668] x20: ffffffc009e53861 x19: ffffff80f77fbf40 x18: 00000000d744fcc9
[    0.394842] x17: 000000000000000b x16: 00000000000001c2 x15: ffffffc009e57550
[    0.396016] x14: 0000000000000000 x13: ffffffffffffffff x12: 0000000100000000
[    0.397190] x11: 0000000000000462 x10: ffffff8040258008 x9 : 0000000100000000
[    0.398364] x8 : 0000000000000000 x7 : ffffffc0093c8bf4 x6 : 0000000000000000
[    0.399538] x5 : 0000000000000000 x4 : ffffffc00a976e40 x3 : ffffffc00810444c
[    0.400711] x2 : 0000000000000004 x1 : 0000000000000000 x0 : 0000000000000000
[    0.401886] Call trace:
[    0.402309]  __flush_work+0x12c/0x138
[    0.402941]  schedule_on_each_cpu+0x228/0x278
[    0.403693]  rcu_tasks_rude_wait_gp+0x130/0x144
[    0.404502]  rcu_tasks_kthread+0x220/0x254
[    0.405264]  kthread+0x174/0x1ac
[    0.405837]  ret_from_fork+0x10/0x20
[    0.406456] irq event stamp: 102
[    0.406966] hardirqs last  enabled at (101): [<ffffffc0093c8468>] _raw_spin_unlock_irq+0x78/0xb4
[    0.408304] hardirqs last disabled at (102): [<ffffffc0093b8270>] el1_dbg+0x24/0x5c
[    0.409410] softirqs last  enabled at (54): [<ffffffc0081b80c8>] local_bh_enable+0xc/0x2c
[    0.410645] softirqs last disabled at (50): [<ffffffc0081b809c>] local_bh_disable+0xc/0x2c
[    0.411890] ---[ end trace 0000000000000000 ]---
[    0.413000] smp: Brought up 1 node, 4 CPUs
[    0.413762] SMP: Total of 4 processors activated.
[    0.414566] CPU features: detected: 32-bit EL0 Support
[    0.415414] CPU features: detected: 32-bit EL1 Support
[    0.416278] CPU features: detected: CRC32 instructions
[    0.447021] Callback from call_rcu_tasks_rude() invoked.
[    0.506693] Callback from call_rcu_tasks() invoked.

This commit therefore fixes this issue by applying a single-CPU
optimization to the RCU Tasks Rude grace-period process.  The key point
here is that the purpose of this RCU flavor is to force a schedule on
each online CPU since some past event.  But the rcu_tasks_rude_wait_gp()
function runs in the context of the RCU Tasks Rude's grace-period kthread,
so there must already have been a context switch on the current CPU since
the call to either synchronize_rcu_tasks_rude() or call_rcu_tasks_rude().
So if there is only a single CPU online, RCU Tasks Rude's grace-period
kthread does not need to anything at all.

It turns out that the rcu_tasks_rude_wait_gp() function's call to
schedule_on_each_cpu() causes problems during early boot.  During that
time, there is only one online CPU, namely the boot CPU.  Therefore,
applying this single-CPU optimization fixes early-boot instances of
this problem.

Link: https://lore.kernel.org/lkml/20220210184319.25009-1-treasure4paddy@gmail.com/T/
Suggested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Padmanabha Srinivasaiah <treasure4paddy@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 99cf3a13954c..b43320b149d2 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -950,6 +950,9 @@ static void rcu_tasks_be_rude(struct work_struct *work)
 // Wait for one rude RCU-tasks grace period.
 static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
 {
+	if (num_online_cpus() <= 1)
+		return;	// Fastpath for only one CPU.
+
 	rtp->n_ipis += cpumask_weight(cpu_online_mask);
 	schedule_on_each_cpu(rcu_tasks_be_rude);
 }
-- 
cgit v1.2.3


From f25390033fa2445cdc4d6cf8243a9b85d942845f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 25 Feb 2022 16:01:12 -0800
Subject: rcu-tasks: Print pre-stall-warning informational messages

RCU-tasks stall-warning messages are printed after the grace period is ten
minutes old.  Unfortunately, most of us will have rebooted the system in
response to an apparently-hung command long before the ten minutes is up,
and will thus see what looks to be a silent hang.

This commit therefore adds pr_info() messages that are printed earlier.
These should avoid being classified as errors, but should give impatient
users a hint.  These are controlled by new rcupdate.rcu_task_stall_info
and rcupdate.rcu_task_stall_info_mult kernel-boot parameters.  The former
defines the initial delay in jiffies (defaulting to 10 seconds) and the
latter defines the multiplier (defaulting to 3).  Thus, by default, the
first message will appear 10 seconds into the RCU-tasks grace period,
the second 40 seconds in, and the third 160 seconds in.  There would be
a fourth at 640 seconds in, but the stall warning message appears 600
seconds in, and once a stall warning is printed for a given grace period,
no further informational messages are printed.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index b43320b149d2..76799c81d4be 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -143,6 +143,11 @@ module_param(rcu_task_ipi_delay, int, 0644);
 #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
 static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
 module_param(rcu_task_stall_timeout, int, 0644);
+#define RCU_TASK_STALL_INFO (HZ * 10)
+static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO;
+module_param(rcu_task_stall_info, int, 0644);
+static int rcu_task_stall_info_mult __read_mostly = 3;
+module_param(rcu_task_stall_info_mult, int, 0444);
 
 static int rcu_task_enqueue_lim __read_mostly = -1;
 module_param(rcu_task_enqueue_lim, int, 0444);
@@ -548,8 +553,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
 static void __init rcu_tasks_bootup_oddness(void)
 {
 #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+	int rtsimc;
+
 	if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
 		pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+	rtsimc = clamp(rcu_task_stall_info_mult, 1, 10);
+	if (rtsimc != rcu_task_stall_info_mult) {
+		pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc);
+		rcu_task_stall_info_mult = rtsimc;
+	}
 #endif /* #ifdef CONFIG_TASKS_RCU */
 #ifdef CONFIG_TASKS_RCU
 	pr_info("\tTrampoline variant of Tasks RCU enabled.\n");
@@ -592,10 +604,15 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t);
 /* Wait for one RCU-tasks grace period. */
 static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 {
-	struct task_struct *g, *t;
-	unsigned long lastreport;
-	LIST_HEAD(holdouts);
+	struct task_struct *g;
 	int fract;
+	LIST_HEAD(holdouts);
+	unsigned long j;
+	unsigned long lastinfo;
+	unsigned long lastreport;
+	bool reported = false;
+	int rtsi;
+	struct task_struct *t;
 
 	set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
 	rtp->pregp_func();
@@ -621,6 +638,8 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 	 * is empty, we are done.
 	 */
 	lastreport = jiffies;
+	lastinfo = lastreport;
+	rtsi = READ_ONCE(rcu_task_stall_info);
 
 	// Start off with initial wait and slowly back off to 1 HZ wait.
 	fract = rtp->init_fract;
@@ -630,7 +649,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 		bool needreport;
 		int rtst;
 
-		/* Slowly back off waiting for holdouts */
+		// Slowly back off waiting for holdouts
 		set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
 		schedule_timeout_idle(fract);
 
@@ -639,12 +658,23 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 
 		rtst = READ_ONCE(rcu_task_stall_timeout);
 		needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
-		if (needreport)
+		if (needreport) {
 			lastreport = jiffies;
+			reported = true;
+		}
 		firstreport = true;
 		WARN_ON(signal_pending(current));
 		set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS);
 		rtp->holdouts_func(&holdouts, needreport, &firstreport);
+
+		// Print pre-stall informational messages if needed.
+		j = jiffies;
+		if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
+			lastinfo = j;
+			rtsi = rtsi * rcu_task_stall_info_mult;
+			pr_info("%s: %s grace period %lu is %lu jiffies old.\n",
+				__func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
+		}
 	}
 
 	set_tasks_gp_state(rtp, RTGS_POST_GP);
-- 
cgit v1.2.3


From 88db792bbe9b140680c74e9f2f801ac00f54e05e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 4 Mar 2022 12:07:25 +0100
Subject: rcu-tasks: Use rcuwait for the rcu_tasks_kthread()

The waitqueue used by rcu_tasks_kthread() has always only one waiter.
With a guaranteed only one waiter, this can be replaced with rcuwait
which is smaller and simpler. With rcuwait based wake counterpart, the
irqwork function (call_rcu_tasks_iw_wakeup()) can be invoked hardirq
context because it is only a wake up and no sleeping locks are involved
(unlike the wait_queue_head).
As a side effect, this is also one piece of the puzzle to pass the RCU
selftest at early boot on PREEMPT_RT.

Replace wait_queue_head with rcuwait and let the irqwork run in hardirq
context on PREEMPT_RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 76799c81d4be..4b91cb214ca7 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -46,7 +46,7 @@ struct rcu_tasks_percpu {
 
 /**
  * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
- * @cbs_wq: Wait queue allowing new callback to get kthread's attention.
+ * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
  * @cbs_gbl_lock: Lock protecting callback list.
  * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
  * @gp_func: This flavor's grace-period-wait function.
@@ -77,7 +77,7 @@ struct rcu_tasks_percpu {
  * @kname: This flavor's kthread name.
  */
 struct rcu_tasks {
-	struct wait_queue_head cbs_wq;
+	struct rcuwait cbs_wait;
 	raw_spinlock_t cbs_gbl_lock;
 	int gp_state;
 	int gp_sleep;
@@ -113,11 +113,11 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
 #define DEFINE_RCU_TASKS(rt_name, gp, call, n)						\
 static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = {			\
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock),		\
-	.rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup),			\
+	.rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup),			\
 };											\
 static struct rcu_tasks rt_name =							\
 {											\
-	.cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq),			\
+	.cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait),				\
 	.cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock),			\
 	.gp_func = gp,									\
 	.call_func = call,								\
@@ -266,7 +266,7 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
 	struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
 
 	rtp = rtpcp->rtpp;
-	wake_up(&rtp->cbs_wq);
+	rcuwait_wake_up(&rtp->cbs_wait);
 }
 
 // Enqueue a callback for the specified flavor of Tasks RCU.
@@ -514,7 +514,9 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 		set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
 
 		/* If there were none, wait a bit and start over. */
-		wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp)));
+		rcuwait_wait_event(&rtp->cbs_wait,
+				   (needgpcb = rcu_tasks_need_gpcb(rtp)),
+				   TASK_IDLE);
 
 		if (needgpcb & 0x2) {
 			// Wait for one grace period.
-- 
cgit v1.2.3


From 5d90070816534882b9158f14154b7e2cdef1194a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 4 Mar 2022 10:41:44 -0800
Subject: rcu-tasks: Make Tasks RCU account for userspace execution

The main Tasks RCU quiescent state is voluntary context switch.  However,
userspace execution is also a valid quiescent state, and is a valuable one
for userspace applications that spin repeatedly executing light-weight
non-sleeping system calls.  Currently, such an application can delay a
Tasks RCU grace period for many tens of seconds.

This commit therefore enlists the aid of the scheduler-clock interrupt to
provide a Tasks RCU quiescent state when it interrupted a task executing
in userspace.

[ paulmck: Apply feedback from kernel test robot. ]

Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Neil Spring <ntspring@fb.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..8dbfb63f0391 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2624,6 +2624,8 @@ void rcu_sched_clock_irq(int user)
 	rcu_flavor_sched_clock_irq(user);
 	if (rcu_pending(user))
 		invoke_rcu_core();
+	if (user)
+		rcu_tasks_classic_qs(current, false);
 	lockdep_assert_irqs_disabled();
 
 	trace_rcu_utilization(TPS("End scheduler-tick"));
-- 
cgit v1.2.3


From 777570d9ef820e470736fa9e02b8e3e48891c050 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 8 Mar 2022 09:54:13 -0800
Subject: rcu-tasks: Use schedule_hrtimeout_range() to wait for grace periods

The synchronous RCU-tasks grace-period-wait primitives invoke
schedule_timeout_idle() to give readers a chance to exit their
read-side critical sections.  Unfortunately, this fails during early
boot on PREEMPT_RT because PREEMPT_RT relies solely on ksoftirqd to run
timer handlers.  Because ksoftirqd cannot operate until its kthreads
are spawned, there is a brief period of time following scheduler
initialization where PREEMPT_RT cannot run the timer handlers that
schedule_timeout_idle() relies on, resulting in a hang.

To avoid this boot-time hang, this commit replaces schedule_timeout_idle()
with schedule_hrtimeout(), so that the timer expires in hardirq context.
This is ensures that the timer fires even on PREEMPT_RT throughout the
irqs-enabled portions of boot as well as during runtime.

The timer is set to expire between fract and fract + HZ / 2 jiffies in
order to align with any other timers that might expire during that time,
thus reducing the number of wakeups.

Note that RCU-tasks grace periods are infrequent, so the use of hrtimer
should be fine.  In contrast, in common-case code, user of hrtimer
could result in performance issues.

Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 4b91cb214ca7..71fe340ab82a 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -647,13 +647,16 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 	fract = rtp->init_fract;
 
 	while (!list_empty(&holdouts)) {
+		ktime_t exp;
 		bool firstreport;
 		bool needreport;
 		int rtst;
 
 		// Slowly back off waiting for holdouts
 		set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
-		schedule_timeout_idle(fract);
+		exp = jiffies_to_nsecs(fract);
+		__set_current_state(TASK_IDLE);
+		schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
 
 		if (fract < HZ)
 			fract++;
-- 
cgit v1.2.3


From bddf7122f7e321d5a677a695e8597064d987482c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 18 Mar 2022 08:08:14 -0700
Subject: rcu-tasks: Restore use of timers for non-RT kernels

The use of hrtimers for RCU-tasks grace-period delays works well in
general, but can result in excessive grace-period delays for some
corner-case workloads.  This commit therefore reverts to the use of
timers for non-RT kernels to mitigate those grace-period delays.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 71fe340ab82a..405614039515 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -654,9 +654,13 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 
 		// Slowly back off waiting for holdouts
 		set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
-		exp = jiffies_to_nsecs(fract);
-		__set_current_state(TASK_IDLE);
-		schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			schedule_timeout_idle(fract);
+		} else {
+			exp = jiffies_to_nsecs(fract);
+			__set_current_state(TASK_IDLE);
+			schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+		}
 
 		if (fract < HZ)
 			fract++;
-- 
cgit v1.2.3


From 10b3742f939c51d53619a31a5c03055c5e0952b4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 28 Mar 2022 16:00:05 -0700
Subject: rcu-tasks: Make show_rcu_tasks_generic_gp_kthread() check all CPUs

Currently, the show_rcu_tasks_generic_gp_kthread() function only looks
at CPU 0's callback lists.  Although this is not fatal, it can confuse
debugging efforts in cases where any of the Tasks RCU flavors are in
per-CPU queueing mode.  This commit therefore causes this function to
scan all CPUs' callback queues.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 405614039515..3aad0dfbfaf4 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -582,7 +582,17 @@ static void __init rcu_tasks_bootup_oddness(void)
 /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
 static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 {
-	struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each...
+	int cpu;
+	bool havecbs = false;
+
+	for_each_possible_cpu(cpu) {
+		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+		if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) {
+			havecbs = true;
+			break;
+		}
+	}
 	pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
 		rtp->kname,
 		tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
@@ -590,7 +600,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 		data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
 		data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
 		".k"[!!data_race(rtp->kthread_ptr)],
-		".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))],
+		".C"[havecbs],
 		s);
 }
 #endif // #ifndef CONFIG_TINY_RCU
-- 
cgit v1.2.3


From 07d95c34e8125a7bf833a94bc3c9d51992d92c45 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Apr 2022 12:30:18 -0700
Subject: rcu-tasks: Handle sparse cpu_possible_mask

If the rcupdate.rcu_task_enqueue_lim kernel boot parameter is set to
something greater than 1 and less than nr_cpu_ids, the code attempts to
use a subset of the CPU's RCU Tasks callback lists.  This works, but only
if the cpu_possible_mask is contiguous.  If there are "holes" in this
mask, the callback-enqueue code might attempt to access a non-existent
per-CPU ->rtcpu variable for a non-existent CPU.  For example, if only
CPUs 0, 4, 8, 12, 16 and so on are in cpu_possible_mask, specifying
rcupdate.rcu_task_enqueue_lim=4 would cause the code to attempt to
use callback queues for non-existent CPUs 1, 2, and 3.  Because such
systems have existed in the past and might still exist, the code needs
to gracefully handle this situation.

This commit therefore checks to see whether the desired CPU is present
in cpu_possible_mask, and, if not, searches for the next CPU.  This means
that the systems administrator of a system with a sparse cpu_possible_mask
will need to account for this sparsity when specifying the value of
the rcupdate.rcu_task_enqueue_lim kernel boot parameter.  For example,
setting this parameter to the value 4 will use only CPUs 0 and 4, which
CPU 4 getting three times the callback load of CPU 0.

This commit assumes that bit (nr_cpu_ids - 1) is always set in
cpu_possible_mask.

Link: https://lore.kernel.org/lkml/CANn89iKaNEwyNZ=L_PQnkH0LP_XjLYrr_dpyRKNNoDJaWKdrmg@mail.gmail.com/
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 3aad0dfbfaf4..fd70d86eb7cd 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -273,7 +273,9 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
 static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 				   struct rcu_tasks *rtp)
 {
+	int chosen_cpu;
 	unsigned long flags;
+	int ideal_cpu;
 	unsigned long j;
 	bool needadjust = false;
 	bool needwake;
@@ -283,8 +285,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
 	rhp->func = func;
 	local_irq_save(flags);
 	rcu_read_lock();
-	rtpcp = per_cpu_ptr(rtp->rtpcpu,
-			    smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift));
+	ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
+	chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+	rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
 	if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
 		raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
 		j = jiffies;
-- 
cgit v1.2.3


From ab2756ea6b74987849b44ad0e33c3cfec159033b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 8 Apr 2022 09:21:50 -0700
Subject: rcu-tasks: Handle sparse cpu_possible_mask in rcu_tasks_invoke_cbs()

If the cpu_possible_mask is sparse (for example, if bits are set only for
CPUs 0, 4, 8, ...), then rcu_tasks_invoke_cbs() will access per-CPU data
for a CPU not in cpu_possible_mask.  It makes these accesses while doing
a workqueue-based binary search for non-empty callback lists.  Although
this search must pass through CPUs not represented in cpu_possible_mask,
it has no need to check the callback list for such CPUs.

This commit therefore changes the rcu_tasks_invoke_cbs() function's
binary search so as to only check callback lists for CPUs present in
cpu_possible_mask.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index fd70d86eb7cd..3925e32159b5 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -468,7 +468,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 		}
 	}
 
-	if (rcu_segcblist_empty(&rtpcp->cblist))
+	if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
 		return;
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
 	rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
-- 
cgit v1.2.3


From 99d6a2acb8955f12489bfba04f2db22bc0b57726 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 4 Feb 2022 12:45:18 -0800
Subject: rcutorture: Suppress debugging grace period delays during flooding

Tree RCU supports grace-period delays using the rcutree.gp_cleanup_delay,
rcutree.gp_init_delay, and rcutree.gp_preinit_delay kernel boot
parameters.  These delays are strictly for debugging purposes, and have
proven quite effective at exposing bugs involving race with CPU-hotplug
operations.  However, these delays can result in false positives when
used in conjunction with callback flooding, for example, those generated
by the rcutorture.fwd_progress kernel boot parameter.

This commit therefore suppresses grace-period delays while callback
flooding is in progress.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu.h        |  4 ++++
 kernel/rcu/rcutorture.c |  4 ++++
 kernel/rcu/tree.c       | 32 +++++++++++++++++++++++++++++---
 3 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..7a221393fcdb 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -523,6 +523,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret
 static inline void show_rcu_gp_kthreads(void) { }
 static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
 static inline void rcu_fwd_progress_check(unsigned long j) { }
+static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
 unsigned long rcu_get_gp_seq(void);
@@ -535,6 +537,8 @@ void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
 extern struct workqueue_struct *rcu_par_gp_wq;
+void rcu_gp_slow_register(atomic_t *rgssp);
+void rcu_gp_slow_unregister(atomic_t *rgssp);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 55d049c39608..f37b7a01dcd0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2916,10 +2916,12 @@ rcu_torture_cleanup(void)
 			pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
 			cur_ops->cb_barrier();
 		}
+		rcu_gp_slow_unregister(NULL);
 		return;
 	}
 	if (!cur_ops) {
 		torture_cleanup_end();
+		rcu_gp_slow_unregister(NULL);
 		return;
 	}
 
@@ -3016,6 +3018,7 @@ rcu_torture_cleanup(void)
 	else
 		rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
 	torture_cleanup_end();
+	rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
 }
 
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -3320,6 +3323,7 @@ rcu_torture_init(void)
 	if (object_debug)
 		rcu_test_debug_objects();
 	torture_init_end();
+	rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
 	return 0;
 
 unwind:
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..db67dae8ed88 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1705,11 +1705,37 @@ static void note_gp_changes(struct rcu_data *rdp)
 		rcu_gp_kthread_wake();
 }
 
+static atomic_t *rcu_gp_slow_suppress;
+
+/* Register a counter to suppress debugging grace-period delays. */
+void rcu_gp_slow_register(atomic_t *rgssp)
+{
+	WARN_ON_ONCE(rcu_gp_slow_suppress);
+
+	WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
+
+/* Unregister a counter, with NULL for not caring which. */
+void rcu_gp_slow_unregister(atomic_t *rgssp)
+{
+	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+
+	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
+
+static bool rcu_gp_slow_is_suppressed(void)
+{
+	atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
+
+	return rgssp && atomic_read(rgssp);
+}
+
 static void rcu_gp_slow(int delay)
 {
-	if (delay > 0 &&
-	    !(rcu_seq_ctr(rcu_state.gp_seq) %
-	      (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+	if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+	    !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
 		schedule_timeout_idle(delay);
 }
 
-- 
cgit v1.2.3


From 8106bddbab5f0ba180e6d693c7c1fc6926d57caa Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 28 Feb 2022 17:40:49 -0800
Subject: scftorture: Fix distribution of short handler delays

The scftorture test module's scf_handler() function is supposed to provide
three different distributions of short delays (including "no delay") and
one distribution of long delays, if specified by the scftorture.longwait
module parameter.  However, the second of the two non-zero-wait short delays
is disabled due to the first such delay's "goto out" not being enclosed in
the "then" clause with the "udelay()".

This commit therefore adjusts the code to provide the intended set of
delays.

Fixes: e9d338a0b179 ("scftorture: Add smp_call_function() torture test")
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/scftorture.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index dcb0410950e4..5d113aa59e77 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in)
 	}
 	this_cpu_inc(scf_invoked_count);
 	if (longwait <= 0) {
-		if (!(r & 0xffc0))
+		if (!(r & 0xffc0)) {
 			udelay(r & 0x3f);
-		goto out;
+			goto out;
+		}
 	}
 	if (r & 0xfff)
 		goto out;
-- 
cgit v1.2.3


From 39b3cab92d3754e18b1f9b5de8158642145b2405 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 7 Mar 2022 14:46:55 -0800
Subject: rcutorture: Avoid corner-case #DE with nsynctypes check

The rcutorture module is used to run torture tests that validate RCU.
rcutorture takes a variety of module parameters that configure the
functionality of the test. Amongst these parameters are the types of
synchronization mechanisms that the rcu_torture_writer and
rcu_torture_fakewriter tasks may use, and the torture_type of the run which
determines what read and sync operations are used by the various writer and
reader tasks that run throughout the test.

When the module is configured to only use sync types for which the
specified torture_type does not implement the necessary operations, we can
end up in a state where nsynctypes is 0. This is not an erroneous state,
but it currently crashes the kernel with a #DE due to nsynctypes being used
with a modulo operator in rcu_torture_fakewriter().

Here is an example of such a #DE:

$ insmod ./rcutorture.ko gp_cond=1 gp_cond_exp=0 gp_exp=0 gp_poll_exp=0
gp_normal=0 gp_poll=0 gp_poll_exp=0 verbose=9999 torture_type=trivial

...

[ 8536.525096] divide error: 0000 [#1] PREEMPT SMP PTI
[ 8536.525101] CPU: 30 PID: 392138 Comm: rcu_torture_fak Kdump: loaded Tainted: G S                5.17.0-rc1-00179-gc8c42c80febd #24
[ 8536.525105] Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020
[ 8536.525106] RIP: 0010:rcu_torture_fakewriter+0xf1/0x2d0 [rcutorture]
[ 8536.525121] Code: 00 31 d2 8d 0c f5 00 00 00 00 48 63 c9 48 f7 f1 48 85 d2 0f 84 79 ff ff ff 48 89 e7 e8 78 78 01 00 48 63 0d 29 ca 00 00 31 d2 <48> f7 f1 8b 04 95 00 05 4e a0 83 f8 06 0f 84 ad 00 00 00 7f 1f 83
[ 8536.525124] RSP: 0018:ffffc9000777fef0 EFLAGS: 00010246
[ 8536.525127] RAX: 00000000223d006e RBX: cccccccccccccccd RCX: 0000000000000000
[ 8536.525130] RDX: 0000000000000000 RSI: ffffffff824315b9 RDI: ffffc9000777fef0
[ 8536.525132] RBP: ffffc9000487bb30 R08: 0000000000000002 R09: 000000000002a580
[ 8536.525134] R10: ffffffff82c5f920 R11: 0000000000000000 R12: ffff8881a2c35d00
[ 8536.525136] R13: ffff8881540c8d00 R14: ffffffffa04d39d0 R15: 0000000000000000
[ 8536.525137] FS:  0000000000000000(0000) GS:ffff88903ff80000(0000) knlGS:0000000000000000
[ 8536.525140] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 8536.525142] CR2: 00007f839f022000 CR3: 0000000002c0a006 CR4: 00000000007706e0
[ 8536.525144] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 8536.525145] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 8536.525147] PKRU: 55555554
[ 8536.525148] Call Trace:
[ 8536.525150]  <TASK>
[ 8536.525153]  kthread+0xe8/0x110
[ 8536.525161]  ? kthread_complete_and_exit+0x20/0x20
[ 8536.525167]  ret_from_fork+0x22/0x30
[ 8536.525174]  </TASK>

The solution is to gracefully handle the case of nsynctypes being 0 in
rcu_torture_fakewriter() by not performing any work. This is already being
done in rcu_torture_writer(), though there is a missing return on that path
which will be fixed in a subsequent patch.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f37b7a01dcd0..d5105fb6c980 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1322,6 +1322,17 @@ rcu_torture_fakewriter(void *arg)
 	VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
 	set_user_nice(current, MAX_NICE);
 
+	if (WARN_ONCE(nsynctypes == 0,
+		      "%s: No update-side primitives.\n", __func__)) {
+		/*
+		 * No updates primitives, so don't try updating.
+		 * The resulting test won't be testing much, hence the
+		 * above WARN_ONCE().
+		 */
+		torture_kthread_stopping("rcu_torture_fakewriter");
+		return 0;
+	}
+
 	do {
 		torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
 		if (cur_ops->cb_barrier != NULL &&
-- 
cgit v1.2.3


From 80dcee695143255261f30c7cc2a041ba413717a4 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 7 Mar 2022 14:46:57 -0800
Subject: rcutorture: Add missing return and use __func__ in warning

The rcutorture module has an rcu_torture_writer task that repeatedly
performs writes, synchronizations, and deletes. There is a corner-case
check in rcu_torture_writer() wherein if nsynctypes is 0, a warning is
issued and the task waits to be stopped via a call to
torture_kthread_stopping() rather than performing any work.

There should be a return statement following this call to
torture_kthread_stopping(), as the intention with issuing the call to
torture_kthread_stopping() in the first place is to avoid the
rcu_torture_writer task from performing any work. Some of the work may even
be dangerous to perform, such as potentially causing a #DE due to
nsynctypes being used in a modulo operator when querying for sync updates
to issue.

This patch adds the missing return call.  As a bonus, it also fixes a
checkpatch warning that was emitted due to the WARN_ONCE() call using the
name of the function rather than __func__.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d5105fb6c980..f1292d9e86b5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1178,7 +1178,7 @@ rcu_torture_writer(void *arg)
 			 " GP expediting controlled from boot/sysfs for %s.\n",
 			 torture_type, cur_ops->name);
 	if (WARN_ONCE(nsynctypes == 0,
-		      "rcu_torture_writer: No update-side primitives.\n")) {
+		      "%s: No update-side primitives.\n", __func__)) {
 		/*
 		 * No updates primitives, so don't try updating.
 		 * The resulting test won't be testing much, hence the
@@ -1186,6 +1186,7 @@ rcu_torture_writer(void *arg)
 		 */
 		rcu_torture_writer_state = RTWS_STOPPING;
 		torture_kthread_stopping("rcu_torture_writer");
+		return 0;
 	}
 
 	do {
-- 
cgit v1.2.3


From 46e861be589881e0905b9ade3d8439883858721c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 17 Mar 2022 09:30:10 -0700
Subject: rcu: Make TASKS_RUDE_RCU select IRQ_WORK

The TASKS_RUDE_RCU does not select IRQ_WORK, which can result in build
failures for kernels that do not otherwise select IRQ_WORK.  This commit
therefore causes the TASKS_RUDE_RCU Kconfig option to select IRQ_WORK.

Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bf8e341e75b4..f559870fbf8b 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -86,6 +86,7 @@ config TASKS_RCU
 
 config TASKS_RUDE_RCU
 	def_bool 0
+	select IRQ_WORK
 	help
 	  This option enables a task-based RCU implementation that uses
 	  only context switch (including preemption) and user-mode
-- 
cgit v1.2.3


From d22959aa93528c6cf4583560696856cf6bba6b72 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Feb 2022 09:10:04 -0800
Subject: rcu: Clarify fill-the-gap comment in rcu_segcblist_advance()

Reported-by: Frederic Weisbecker <frederic@kernel.org>
Reported-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reported-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 81145c3ece25..c54ea2b6a36b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 		WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
 
 	/*
-	 * Callbacks moved, so clean up the misordered ->tails[] pointers
-	 * that now point into the middle of the list of ready-to-invoke
-	 * callbacks.  The overall effect is to copy down the later pointers
-	 * into the gap that was created by the now-ready segments.
+	 * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+	 * and a non-empty RCU_NEXT_READY_TAIL.  If so, copy the
+	 * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+	 * created by the now-ready-to-invoke segments.
 	 */
 	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
 		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
-- 
cgit v1.2.3


From 90d2efe7bdbde5371b6122174af0718843f805c6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 16 Feb 2022 09:54:56 -0800
Subject: rcu: Fix rcu_preempt_deferred_qs_irqrestore() strict QS reporting

Suppose we have a kernel built with both CONFIG_RCU_STRICT_GRACE_PERIOD=y
and CONFIG_PREEMPT=y.  Suppose further that an RCU reader from which RCU
core needs a quiescent state ends in rcu_preempt_deferred_qs_irqrestore().
This function will then invoke rcu_report_qs_rdp() in order to immediately
report that quiescent state.  Unfortunately, it will not have cleared
that reader's CPU's rcu_data structure's ->cpu_no_qs.b.norm field.
As a result, rcu_report_qs_rdp() will take an early exit because it
will believe that this CPU has not yet encountered a quiescent state,
and there will be no reporting of the current quiescent state.

This commit therefore causes rcu_preempt_deferred_qs_irqrestore() to
clear the ->cpu_no_qs.b.norm field before invoking rcu_report_qs_rdp().

Kudos to Boqun Feng and Neeraj Upadhyay for helping with analysis of
this issue!

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8360d86db1c0..176639c6215f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 	t->rcu_read_unlock_special.s = 0;
 	if (special.b.need_qs) {
 		if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+			rdp->cpu_no_qs.b.norm = false;
 			rcu_report_qs_rdp(rdp);
 			udelay(rcu_unlock_delay);
 		} else {
-- 
cgit v1.2.3


From c708b08c65a0dfae127b9ee33b0fb73535a5e066 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 23 Feb 2022 17:29:37 -0800
Subject: rcu: Check for jiffies going backwards

A report of a 12-jiffy normal RCU CPU stall warning raises interesting
questions about the nature of time on the offending system.  This commit
instruments rcu_sched_clock_irq(), which is RCU's hook into the
scheduling-clock interrupt, checking for the jiffies counter going
backwards.

Reported-by: Saravanan D <sarvanand@fb.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 10 ++++++++++
 kernel/rcu/tree.h |  1 +
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..a5ea67454640 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1679,6 +1679,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
 	rdp->gp_seq = rnp->gp_seq;  /* Remember new grace-period state. */
 	if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
 		WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+	if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+		WRITE_ONCE(rdp->last_sched_clock, jiffies);
 	WRITE_ONCE(rdp->gpwrap, false);
 	rcu_gpnum_ovf(rnp, rdp);
 	return ret;
@@ -2609,6 +2611,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
  */
 void rcu_sched_clock_irq(int user)
 {
+	unsigned long j;
+
+	if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+		j = jiffies;
+		WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+		__this_cpu_write(rcu_data.last_sched_clock, j);
+	}
 	trace_rcu_utilization(TPS("Start scheduler-tick"));
 	lockdep_assert_irqs_disabled();
 	raw_cpu_inc(rcu_data.ticks_this_gp);
@@ -4179,6 +4188,7 @@ rcu_boot_init_percpu_data(int cpu)
 	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
 	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
 	rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+	rdp->last_sched_clock = jiffies;
 	rdp->cpu = cpu;
 	rcu_boot_init_nocb_percpu_data(rdp);
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673ebe355..94b55f669915 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -254,6 +254,7 @@ struct rcu_data {
 	unsigned long rcu_onl_gp_seq;	/* ->gp_seq at last online. */
 	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
+	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
 
 	int cpu;
 };
-- 
cgit v1.2.3


From c7dfb2591b23856182e2c9da6b1d526ef7fd6b2e Mon Sep 17 00:00:00 2001
From: "Longpeng(Mike)" <longpeng2@huawei.com>
Date: Wed, 9 Feb 2022 09:02:14 +0100
Subject: cpu/hotplug: Allow the CPU in CPU_UP_PREPARE state to be brought up
 again.

A CPU will not show up in virtualized environment which includes an
Enclave. The VM splits its resources into a primary VM and a Enclave
VM. While the Enclave is active, the hypervisor will ignore all requests to
bring up a CPU and this CPU will remain in CPU_UP_PREPARE state.

The kernel will wait up to ten seconds for CPU to show up (do_boot_cpu())
and then rollback the hotplug state back to CPUHP_OFFLINE leaving the CPU
state in CPU_UP_PREPARE. The CPU state is set back to CPUHP_TEARDOWN_CPU
during the CPU_POST_DEAD stage.

After the Enclave VM terminates, the primary VM can bring up the CPU
again.

Allow to bring up the CPU if it is in the CPU_UP_PREPARE state.

[bigeasy: Rewrite commit description.]

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Dongli Zhang <dongli.zhang@oracle.com>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Henry Wang <Henry.Wang@arm.com>
Link: https://lore.kernel.org/r/20220209080214.1439408-3-bigeasy@linutronix.de
---
 kernel/smpboot.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f6bc0bc8a2aa..b9f54544e749 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -392,6 +392,13 @@ int cpu_check_up_prepare(int cpu)
 		 */
 		return -EAGAIN;
 
+	case CPU_UP_PREPARE:
+		/*
+		 * Timeout while waiting for the CPU to show up. Allow to try
+		 * again later.
+		 */
+		return 0;
+
 	default:
 
 		/* Should not happen.  Famous last words. */
-- 
cgit v1.2.3


From fc3a9a9858478ab5f8441765a3f9552a0ceba10c Mon Sep 17 00:00:00 2001
From: Pierre Gondois <Pierre.Gondois@arm.com>
Date: Mon, 21 Mar 2022 09:57:23 +0000
Subject: PM: EM: Add artificial EM flag

The Energy Model (EM) can be used on platforms which are missing real
power information. Those platforms would implement .get_cost() which
populates needed values for the Energy Aware Scheduler (EAS). The EAS
doesn't use 'power' fields from EM, but other frameworks might use them.
Thus, to avoid miss-usage of this specific type of EM, introduce a new
flags which can be checked by other frameworks.

Signed-off-by: Pierre Gondois <Pierre.Gondois@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/energy_model.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 0153b0ca7b23..6ecee99af842 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -384,6 +384,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 
 	if (milliwatts)
 		dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+	else if (cb->get_cost)
+		dev->em_pd->flags |= EM_PERF_DOMAIN_ARTIFICIAL;
 
 	em_cpufreq_update_efficiencies(dev);
 
-- 
cgit v1.2.3


From 91362463114eb63ead5f02c11d58c46064c339e7 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Mon, 21 Mar 2022 09:57:24 +0000
Subject: PM: EM: Use the new .get_cost() callback while registering EM

The Energy Model (EM) allows to provide the 'cost' values when the device
driver provides the .get_cost() optional callback. This removes
restriction which is in the EM calculation function of the 'cost'
for each performance state. Now, the driver is in charge of providing
the right values which are then used by Energy Aware Scheduler.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/energy_model.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 6ecee99af842..95a3b33001f6 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -121,7 +121,8 @@ static void em_debug_remove_pd(struct device *dev) {}
 #endif
 
 static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
-				int nr_states, struct em_data_callback *cb)
+				int nr_states, struct em_data_callback *cb,
+				unsigned long flags)
 {
 	unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
 	struct em_perf_state *table;
@@ -173,10 +174,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 	/* Compute the cost of each performance state. */
 	fmax = (u64) table[nr_states - 1].frequency;
 	for (i = nr_states - 1; i >= 0; i--) {
-		unsigned long power_res = em_scale_power(table[i].power);
+		unsigned long power_res, cost;
+
+		if (flags & EM_PERF_DOMAIN_ARTIFICIAL) {
+			ret = cb->get_cost(dev, table[i].frequency, &cost);
+			if (ret || !cost || cost > EM_MAX_POWER) {
+				dev_err(dev, "EM: invalid cost %lu %d\n",
+					cost, ret);
+				goto free_ps_table;
+			}
+		} else {
+			power_res = em_scale_power(table[i].power);
+			cost = div64_u64(fmax * power_res, table[i].frequency);
+		}
+
+		table[i].cost = cost;
 
-		table[i].cost = div64_u64(fmax * power_res,
-					  table[i].frequency);
 		if (table[i].cost >= prev_cost) {
 			table[i].flags = EM_PERF_STATE_INEFFICIENT;
 			dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
@@ -197,7 +210,8 @@ free_ps_table:
 }
 
 static int em_create_pd(struct device *dev, int nr_states,
-			struct em_data_callback *cb, cpumask_t *cpus)
+			struct em_data_callback *cb, cpumask_t *cpus,
+			unsigned long flags)
 {
 	struct em_perf_domain *pd;
 	struct device *cpu_dev;
@@ -215,7 +229,7 @@ static int em_create_pd(struct device *dev, int nr_states,
 			return -ENOMEM;
 	}
 
-	ret = em_create_perf_table(dev, pd, nr_states, cb);
+	ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
 	if (ret) {
 		kfree(pd);
 		return ret;
@@ -332,6 +346,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				bool milliwatts)
 {
 	unsigned long cap, prev_cap = 0;
+	unsigned long flags = 0;
 	int cpu, ret;
 
 	if (!dev || !nr_states || !cb)
@@ -378,14 +393,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		}
 	}
 
-	ret = em_create_pd(dev, nr_states, cb, cpus);
+	if (milliwatts)
+		flags |= EM_PERF_DOMAIN_MILLIWATTS;
+	else if (cb->get_cost)
+		flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+	ret = em_create_pd(dev, nr_states, cb, cpus, flags);
 	if (ret)
 		goto unlock;
 
-	if (milliwatts)
-		dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
-	else if (cb->get_cost)
-		dev->em_pd->flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+	dev->em_pd->flags |= flags;
 
 	em_cpufreq_update_efficiencies(dev);
 
-- 
cgit v1.2.3


From 75a3a99a5a9886af13be44e640cb415ebda80db2 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Mon, 21 Mar 2022 09:57:25 +0000
Subject: PM: EM: Change the order of arguments in the .active_power() callback

The .active_power() callback passes the device pointer when it's called.
Aligned with a convetion present in other subsystems and pass the 'dev'
as a first argument. It looks more cleaner.

Adjust all affected drivers which implement that API callback.

Suggested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/energy_model.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 95a3b33001f6..babefc72085d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -140,7 +140,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 		 * lowest performance state of 'dev' above 'freq' and updates
 		 * 'power' and 'freq' accordingly.
 		 */
-		ret = cb->active_power(&power, &freq, dev);
+		ret = cb->active_power(dev, &power, &freq);
 		if (ret) {
 			dev_err(dev, "EM: invalid perf. state: %d\n",
 				ret);
-- 
cgit v1.2.3


From 16857482b830544f5849daa1048f82d00b68957e Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Mon, 21 Mar 2022 09:57:26 +0000
Subject: PM: EM: Remove old debugfs files and print all 'flags'

The Energy Model gets more bits used in 'flags'. Avoid adding another
debugfs file just to print what is the status of a new flag. Simply
remove old debugfs files and add one generic which prints all flags
as a hex value.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/energy_model.c | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index babefc72085d..092513575e4e 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
 
-static int em_debug_units_show(struct seq_file *s, void *unused)
+static int em_debug_flags_show(struct seq_file *s, void *unused)
 {
 	struct em_perf_domain *pd = s->private;
-	char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
-		"milliWatts" : "bogoWatts";
 
-	seq_printf(s, "%s\n", units);
+	seq_printf(s, "%#lx\n", pd->flags);
 
 	return 0;
 }
-DEFINE_SHOW_ATTRIBUTE(em_debug_units);
-
-static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
-{
-	struct em_perf_domain *pd = s->private;
-	int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
-
-	seq_printf(s, "%d\n", enabled);
-
-	return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
 
 static void em_debug_create_pd(struct device *dev)
 {
@@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev)
 		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
 				    &em_debug_cpus_fops);
 
-	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
-	debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
-			    &em_debug_skip_inefficiencies_fops);
+	debugfs_create_file("flags", 0444, d, dev->em_pd,
+			    &em_debug_flags_fops);
 
 	/* Create a sub-directory for each performance state */
 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
-- 
cgit v1.2.3


From ae20cb9aec91ec3c668e413b93a6080803464ae5 Mon Sep 17 00:00:00 2001
From: David Cohen <dacohen@pm.me>
Date: Thu, 24 Mar 2022 08:07:24 +0000
Subject: PM: sleep: Narrow down -DDEBUG on kernel/power/ files

The macro -DDEBUG is broadly enabled on kernel/power/ directory if
CONFIG_DYNAMIC_DEBUG is enabled. As side effect all debug messages using
pr_debug() and dev_dbg() are enabled by default on dynamic debug.
We're reworking pm_pr_dbg() to support dynamic debug, where pm_pr_dbg()
will print message if either pm_debug_messages_on flag is set or if it's
explicitly enabled on dynamic debug's control. That means if we let
-DDEBUG broadly set, the pm_debug_messages_on flag will be bypassed by
default on pm_pr_dbg() if dynamic debug is also enabled.

The files that directly use pr_debug() and dev_dbg() on kernel/power/ are:
 - swap.c
 - snapshot.c
 - energy_model.c

And those files do not use pm_pr_dbg(). So if we limit -DDEBUG to them,
we keep the same functional behavior while allowing the pm_pr_dbg()
refactor.

Signed-off-by: David Cohen <dacohen@pm.me>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/Makefile  | 6 +++++-
 kernel/power/process.c | 3 ---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8bef..874ad834dc8d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
 
-ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o                   := -DDEBUG
+CFLAGS_snapshot.o               := -DDEBUG
+CFLAGS_energy_model.o           := -DDEBUG
+endif
 
 KASAN_SANITIZE_snapshot.o	:= n
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 11b570fcf049..3068601e585a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -6,9 +6,6 @@
  * Originally from swsusp.
  */
 
-
-#undef DEBUG
-
 #include <linux/interrupt.h>
 #include <linux/oom.h>
 #include <linux/suspend.h>
-- 
cgit v1.2.3


From ce1cb680ff1c5b88505f878137796b1723e00193 Mon Sep 17 00:00:00 2001
From: David Cohen <dacohen@pm.me>
Date: Thu, 24 Mar 2022 08:07:30 +0000
Subject: PM: sleep: enable dynamic debug support within pm_pr_dbg()

Currently pm_pr_dbg() is used to filter kernel pm debug messages based
on pm_debug_messages_on flag. The problem is if we enable/disable this
flag it will affect all pm_pr_dbg() calls at once, so we can't
individually control them.

This patch changes pm_pr_dbg() implementation as such:

 - If pm_debug_messages_on is enabled, print the message.
 - If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is
   enabled, only print the messages explicitly enabled on
   /sys/kernel/debug/dynamic_debug/control.
 - If pm_debug_messages_on is disabled and CONFIG_DYNAMIC_DEBUG is
   disabled, don't print the message.

Signed-off-by: David Cohen <dacohen@pm.me>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/main.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7e646079fbeb..5242bf2ee469 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -545,35 +545,6 @@ static int __init pm_debug_messages_setup(char *str)
 }
 __setup("pm_debug_messages", pm_debug_messages_setup);
 
-/**
- * __pm_pr_dbg - Print a suspend debug message to the kernel log.
- * @defer: Whether or not to use printk_deferred() to print the message.
- * @fmt: Message format.
- *
- * The message will be emitted if enabled through the pm_debug_messages
- * sysfs attribute.
- */
-void __pm_pr_dbg(bool defer, const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	if (!pm_debug_messages_on)
-		return;
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	if (defer)
-		printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
-	else
-		printk(KERN_DEBUG "PM: %pV", &vaf);
-
-	va_end(args);
-}
-
 #else /* !CONFIG_PM_SLEEP_DEBUG */
 static inline void pm_print_times_init(void) {}
 #endif /* CONFIG_PM_SLEEP_DEBUG */
-- 
cgit v1.2.3


From 467df4cfdc10cb497ac8a977458e50b8266f915f Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Fri, 25 Mar 2022 09:44:15 +0800
Subject: PM: hibernate: Fix some kernel-doc comments

Add parameter description in alloc_rtree_node()  kernel-doc
comment and fix several inconsistent function name descriptions.

Remove some warnings found by running scripts/kernel-doc,
which is caused by using 'make W=1'.

kernel/power/snapshot.c:438: warning: Function parameter or member
'gfp_mask' not described in 'alloc_rtree_node'
kernel/power/snapshot.c:438: warning: Function parameter or member
'safe_needed' not described in 'alloc_rtree_node'
kernel/power/snapshot.c:438: warning: Function parameter or member 'ca'
not described in 'alloc_rtree_node'
kernel/power/snapshot.c:438: warning: Function parameter or member
'list' not described in 'alloc_rtree_node'
kernel/power/snapshot.c:916: warning: expecting prototype for
memory_bm_rtree_next_pfn(). Prototype was for memory_bm_next_pfn()
instead
kernel/power/snapshot.c:1947: warning: expecting prototype for
alloc_highmem_image_pages(). Prototype was for alloc_highmem_pages()
instead
kernel/power/snapshot.c:2230: warning: expecting prototype for load
header(). Prototype was for load_header() instead

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
[ rjw: Comment adjustments to avoid line breaks ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 330d49937692..8e4fbf44b376 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -427,6 +427,10 @@ struct memory_bitmap {
 
 /**
  * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages not used before hibernation (restore only)
+ * @ca: Pointer to a linked list of pages ("a chain") to allocate from
+ * @list: Radix Tree node to add.
  *
  * This function is used to allocate inner nodes as well as the
  * leave nodes of the radix tree. It also adds the node to the
@@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 }
 
 /**
- * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
  * @bm: Memory bitmap.
  *
  * Starting from the last returned position this function searches for the next
@@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed)
 }
 
 /**
- * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
  *
  * Try to allocate as many pages as needed, but if the number of free highmem
  * pages is less than that, allocate them all.
@@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info)
 }
 
 /**
- * load header - Check the image header and copy the data from it.
+ * load_header - Check the image header and copy the data from it.
  */
 static int load_header(struct swsusp_info *info)
 {
-- 
cgit v1.2.3


From e5a3b0c5b6a194c94d6d64308311eff7d46e57c0 Mon Sep 17 00:00:00 2001
From: Haowen Bai <baihaowen@meizu.com>
Date: Fri, 1 Apr 2022 18:01:34 +0800
Subject: PM: hibernate: Don't mark comment as kernel-doc

Change the comment to a normal (non-kernel-doc) comment to avoid
these kernel-doc warnings:

kernel/power/snapshot.c:335: warning: This comment starts with '/**', but
 isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
 * Data types related to memory bitmaps.

Signed-off-by: Haowen Bai <baihaowen@meizu.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8e4fbf44b376..2a406753af90 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 	return ret;
 }
 
-/**
+/*
  * Data types related to memory bitmaps.
  *
  * Memory bitmap is a structure consisting of many linked lists of
-- 
cgit v1.2.3


From 9e949a3886356fe9112c6f6f34a6e23d1d35407f Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Sat, 19 Mar 2022 00:20:15 -0700
Subject: smp: Fix offline cpu check in flush_smp_call_function_queue()

The check in flush_smp_call_function_queue() for callbacks that are sent
to offline CPUs currently checks whether the queue is empty.

However, flush_smp_call_function_queue() has just deleted all the
callbacks from the queue and moved all the entries into a local list.
This checks would only be positive if some callbacks were added in the
short time after llist_del_all() was called. This does not seem to be
the intention of this check.

Change the check to look at the local list to which the entries were
moved instead of the queue from which all the callbacks were just
removed.

Fixes: 8d056c48e4862 ("CPU hotplug, smp: flush any pending IPI callbacks before CPU offline")
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220319072015.1495036-1-namit@vmware.com
---
 kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 01a7c1706a58..65a630f62363 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -579,7 +579,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 
 	/* There shouldn't be any pending callbacks on an offline CPU. */
 	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
-		     !warned && !llist_empty(head))) {
+		     !warned && entry != NULL)) {
 		warned = true;
 		WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 
-- 
cgit v1.2.3


From b7ba6d8dc3569e49800ef0136799f26f43e237e8 Mon Sep 17 00:00:00 2001
From: Steven Price <steven.price@arm.com>
Date: Mon, 11 Apr 2022 16:22:32 +0100
Subject: cpu/hotplug: Remove the 'cpu' member of cpuhp_cpu_state

Currently the setting of the 'cpu' member of struct cpuhp_cpu_state in
cpuhp_create() is too late as it is used earlier in _cpu_up().

If kzalloc_node() in __smpboot_create_thread() fails then the rollback will
be done with st->cpu==0 causing CPU0 to be erroneously set to be dying,
causing the scheduler to get mightily confused and throw its toys out of
the pram.

However the cpu number is actually available directly, so simply remove
the 'cpu' member and avoid the problem in the first place.

Fixes: 2ea46c6fc945 ("cpumask/hotplug: Fix cpu_dying() state tracking")
Signed-off-by: Steven Price <steven.price@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220411152233.474129-2-steven.price@arm.com
---
 kernel/cpu.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5797c2a7a93f..d0a9aa0b42e8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -71,7 +71,6 @@ struct cpuhp_cpu_state {
 	bool			rollback;
 	bool			single;
 	bool			bringup;
-	int			cpu;
 	struct hlist_node	*node;
 	struct hlist_node	*last;
 	enum cpuhp_state	cb_state;
@@ -475,7 +474,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
 #endif
 
 static inline enum cpuhp_state
-cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
 {
 	enum cpuhp_state prev_state = st->state;
 	bool bringup = st->state < target;
@@ -486,14 +485,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
 	st->target = target;
 	st->single = false;
 	st->bringup = bringup;
-	if (cpu_dying(st->cpu) != !bringup)
-		set_cpu_dying(st->cpu, !bringup);
+	if (cpu_dying(cpu) != !bringup)
+		set_cpu_dying(cpu, !bringup);
 
 	return prev_state;
 }
 
 static inline void
-cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+		  enum cpuhp_state prev_state)
 {
 	bool bringup = !st->bringup;
 
@@ -520,8 +520,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
 	}
 
 	st->bringup = bringup;
-	if (cpu_dying(st->cpu) != !bringup)
-		set_cpu_dying(st->cpu, !bringup);
+	if (cpu_dying(cpu) != !bringup)
+		set_cpu_dying(cpu, !bringup);
 }
 
 /* Regular hotplug invocation of the AP hotplug thread */
@@ -541,15 +541,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
 	wait_for_ap_thread(st, st->bringup);
 }
 
-static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+			 enum cpuhp_state target)
 {
 	enum cpuhp_state prev_state;
 	int ret;
 
-	prev_state = cpuhp_set_state(st, target);
+	prev_state = cpuhp_set_state(cpu, st, target);
 	__cpuhp_kick_ap(st);
 	if ((ret = st->result)) {
-		cpuhp_reset_state(st, prev_state);
+		cpuhp_reset_state(cpu, st, prev_state);
 		__cpuhp_kick_ap(st);
 	}
 
@@ -581,7 +582,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
 	if (st->target <= CPUHP_AP_ONLINE_IDLE)
 		return 0;
 
-	return cpuhp_kick_ap(st, st->target);
+	return cpuhp_kick_ap(cpu, st, st->target);
 }
 
 static int bringup_cpu(unsigned int cpu)
@@ -704,7 +705,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 			 ret, cpu, cpuhp_get_step(st->state)->name,
 			 st->state);
 
-		cpuhp_reset_state(st, prev_state);
+		cpuhp_reset_state(cpu, st, prev_state);
 		if (can_rollback_cpu(st))
 			WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
 							    prev_state));
@@ -721,7 +722,6 @@ static void cpuhp_create(unsigned int cpu)
 
 	init_completion(&st->done_up);
 	init_completion(&st->done_down);
-	st->cpu = cpu;
 }
 
 static int cpuhp_should_run(unsigned int cpu)
@@ -875,7 +875,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
 	cpuhp_lock_release(true);
 
 	trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
-	ret = cpuhp_kick_ap(st, st->target);
+	ret = cpuhp_kick_ap(cpu, st, st->target);
 	trace_cpuhp_exit(cpu, st->state, prev_state, ret);
 
 	return ret;
@@ -1107,7 +1107,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 			 ret, cpu, cpuhp_get_step(st->state)->name,
 			 st->state);
 
-		cpuhp_reset_state(st, prev_state);
+		cpuhp_reset_state(cpu, st, prev_state);
 
 		if (st->state < prev_state)
 			WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
@@ -1134,7 +1134,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	prev_state = cpuhp_set_state(st, target);
+	prev_state = cpuhp_set_state(cpu, st, target);
 	/*
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread.
@@ -1165,7 +1165,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	ret = cpuhp_down_callbacks(cpu, st, target);
 	if (ret && st->state < prev_state) {
 		if (st->state == CPUHP_TEARDOWN_CPU) {
-			cpuhp_reset_state(st, prev_state);
+			cpuhp_reset_state(cpu, st, prev_state);
 			__cpuhp_kick_ap(st);
 		} else {
 			WARN(1, "DEAD callback error for CPU%d", cpu);
@@ -1352,7 +1352,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
-	cpuhp_set_state(st, target);
+	cpuhp_set_state(cpu, st, target);
 	/*
 	 * If the current CPU state is in the range of the AP hotplug thread,
 	 * then we need to kick the thread once more.
-- 
cgit v1.2.3


From d308077e5e4dc8c93f97f5ebc70274e7c7a92d49 Mon Sep 17 00:00:00 2001
From: Steven Price <steven.price@arm.com>
Date: Mon, 11 Apr 2022 16:22:33 +0100
Subject: cpu/hotplug: Initialise all cpuhp_cpu_state structs earlier

Rather than waiting until a CPU is first brought online, do the
initialisation of the cpuhp_cpu_state structure for each CPU during the
__init phase. This saves a (small) amount of non-__init memory and
avoids potential confusion about when the cpuhp_cpu_state struct is
valid.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Price <steven.price@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220411152233.474129-3-steven.price@arm.com
---
 kernel/cpu.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index d0a9aa0b42e8..02a77ac08399 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -716,14 +716,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 /*
  * The cpu hotplug threads manage the bringup and teardown of the cpus
  */
-static void cpuhp_create(unsigned int cpu)
-{
-	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-
-	init_completion(&st->done_up);
-	init_completion(&st->done_down);
-}
-
 static int cpuhp_should_run(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
@@ -883,15 +875,27 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
 
 static struct smp_hotplug_thread cpuhp_threads = {
 	.store			= &cpuhp_state.thread,
-	.create			= &cpuhp_create,
 	.thread_should_run	= cpuhp_should_run,
 	.thread_fn		= cpuhp_thread_fun,
 	.thread_comm		= "cpuhp/%u",
 	.selfparking		= true,
 };
 
+static __init void cpuhp_init_state(void)
+{
+	struct cpuhp_cpu_state *st;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		st = per_cpu_ptr(&cpuhp_state, cpu);
+		init_completion(&st->done_up);
+		init_completion(&st->done_down);
+	}
+}
+
 void __init cpuhp_threads_init(void)
 {
+	cpuhp_init_state();
 	BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
 	kthread_unpark(this_cpu_read(cpuhp_state.thread));
 }
-- 
cgit v1.2.3


From 2900005ea287b11dcc8c1b9fcf24893b7ff41d6d Mon Sep 17 00:00:00 2001
From: Yan Zhu <zhuyan34@huawei.com>
Date: Thu, 7 Apr 2022 15:07:59 +0800
Subject: bpf: Move BPF sysctls from kernel/sysctl.c to BPF core

We're moving sysctls out of kernel/sysctl.c as it is a mess. We
already moved all filesystem sysctls out. And with time the goal
is to move all sysctls out to their own subsystem/actual user.

kernel/sysctl.c has grown to an insane mess and its easy to run
into conflicts with it. The effort to move them out into various
subsystems is part of this.

Signed-off-by: Yan Zhu <zhuyan34@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/bpf/20220407070759.29506-1-zhuyan34@huawei.com
---
 kernel/bpf/syscall.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sysctl.c      | 79 -----------------------------------------------
 2 files changed, 87 insertions(+), 79 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cdaa1152436a..e9621cfa09f2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4908,3 +4908,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
 const struct bpf_prog_ops bpf_syscall_prog_ops = {
 	.test_run = bpf_prog_test_run_syscall,
 };
+
+#ifdef CONFIG_SYSCTL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+			     void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct static_key *key = (struct static_key *)table->data;
+	static int saved_val;
+	int val, ret;
+	struct ctl_table tmp = {
+		.data   = &val,
+		.maxlen = sizeof(val),
+		.mode   = table->mode,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	};
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&bpf_stats_enabled_mutex);
+	val = saved_val;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret && val != saved_val) {
+		if (val)
+			static_key_slow_inc(key);
+		else
+			static_key_slow_dec(key);
+		saved_val = val;
+	}
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return ret;
+}
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(struct ctl_table *table, int write,
+			      void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, unpriv_enable = *(int *)table->data;
+	bool locked_state = unpriv_enable == 1;
+	struct ctl_table tmp = *table;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	tmp.data = &unpriv_enable;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret) {
+		if (locked_state && unpriv_enable != 1)
+			return -EPERM;
+		*(int *)table->data = unpriv_enable;
+	}
+
+	unpriv_ebpf_notify(unpriv_enable);
+
+	return ret;
+}
+
+static struct ctl_table bpf_syscall_table[] = {
+	{
+		.procname	= "unprivileged_bpf_disabled",
+		.data		= &sysctl_unprivileged_bpf_disabled,
+		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
+		.mode		= 0644,
+		.proc_handler	= bpf_unpriv_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_TWO,
+	},
+	{
+		.procname	= "bpf_stats_enabled",
+		.data		= &bpf_stats_enabled_key.key,
+		.maxlen		= sizeof(bpf_stats_enabled_key),
+		.mode		= 0644,
+		.proc_handler	= bpf_stats_handler,
+	},
+	{ }
+};
+
+static int __init bpf_syscall_sysctl_init(void)
+{
+	register_sysctl_init("kernel", bpf_syscall_table);
+	return 0;
+}
+late_initcall(bpf_syscall_sysctl_init);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 830aaf8ca08e..47139877f62d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -62,7 +62,6 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
-#include <linux/bpf.h>
 #include <linux/mount.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/latencytop.h>
@@ -148,66 +147,6 @@ static const int max_extfrag_threshold = 1000;
 
 #endif /* CONFIG_SYSCTL */
 
-#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
-static int bpf_stats_handler(struct ctl_table *table, int write,
-			     void *buffer, size_t *lenp, loff_t *ppos)
-{
-	struct static_key *key = (struct static_key *)table->data;
-	static int saved_val;
-	int val, ret;
-	struct ctl_table tmp = {
-		.data   = &val,
-		.maxlen = sizeof(val),
-		.mode   = table->mode,
-		.extra1 = SYSCTL_ZERO,
-		.extra2 = SYSCTL_ONE,
-	};
-
-	if (write && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&bpf_stats_enabled_mutex);
-	val = saved_val;
-	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
-	if (write && !ret && val != saved_val) {
-		if (val)
-			static_key_slow_inc(key);
-		else
-			static_key_slow_dec(key);
-		saved_val = val;
-	}
-	mutex_unlock(&bpf_stats_enabled_mutex);
-	return ret;
-}
-
-void __weak unpriv_ebpf_notify(int new_state)
-{
-}
-
-static int bpf_unpriv_handler(struct ctl_table *table, int write,
-			      void *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret, unpriv_enable = *(int *)table->data;
-	bool locked_state = unpriv_enable == 1;
-	struct ctl_table tmp = *table;
-
-	if (write && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	tmp.data = &unpriv_enable;
-	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
-	if (write && !ret) {
-		if (locked_state && unpriv_enable != 1)
-			return -EPERM;
-		*(int *)table->data = unpriv_enable;
-	}
-
-	unpriv_ebpf_notify(unpriv_enable);
-
-	return ret;
-}
-#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
-
 /*
  * /proc/sys support
  */
@@ -2299,24 +2238,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
-#ifdef CONFIG_BPF_SYSCALL
-	{
-		.procname	= "unprivileged_bpf_disabled",
-		.data		= &sysctl_unprivileged_bpf_disabled,
-		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
-		.mode		= 0644,
-		.proc_handler	= bpf_unpriv_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_TWO,
-	},
-	{
-		.procname	= "bpf_stats_enabled",
-		.data		= &bpf_stats_enabled_key.key,
-		.maxlen		= sizeof(bpf_stats_enabled_key),
-		.mode		= 0644,
-		.proc_handler	= bpf_stats_handler,
-	},
-#endif
 #if defined(CONFIG_TREE_RCU)
 	{
 		.procname	= "panic_on_rcu_stall",
-- 
cgit v1.2.3


From 241d50ec5d79b94694adf13853c1f55d0f0b85e6 Mon Sep 17 00:00:00 2001
From: Yu Zhe <yuzhe@nfschina.com>
Date: Tue, 12 Apr 2022 18:50:48 -0700
Subject: bpf: Remove unnecessary type castings

Remove/clean up unnecessary void * type castings.

Signed-off-by: Yu Zhe <yuzhe@nfschina.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220413015048.12319-1-yuzhe@nfschina.com
---
 kernel/bpf/bpf_struct_ops.c | 4 ++--
 kernel/bpf/hashtab.c        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 21069dbe9138..de01d37c2d3b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -263,7 +263,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	/* No lock is needed.  state and refcnt do not need
 	 * to be updated together under atomic context.
 	 */
-	uvalue = (struct bpf_struct_ops_value *)value;
+	uvalue = value;
 	memcpy(uvalue, st_map->uvalue, map->value_size);
 	uvalue->state = state;
 	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
@@ -353,7 +353,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	if (err)
 		return err;
 
-	uvalue = (struct bpf_struct_ops_value *)value;
+	uvalue = value;
 	err = check_zero_holes(t, uvalue->data);
 	if (err)
 		return err;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 65877967f414..c68fbebc8c00 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -738,7 +738,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
  */
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 {
-	struct bpf_htab *htab = (struct bpf_htab *)arg;
+	struct bpf_htab *htab = arg;
 	struct htab_elem *l = NULL, *tgt_l;
 	struct hlist_nulls_head *head;
 	struct hlist_nulls_node *n;
-- 
cgit v1.2.3


From 9e02977bfad006af328add9434c8bffa40e053bb Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Wed, 13 Apr 2022 08:32:22 +0200
Subject: dma-direct: avoid redundant memory sync for swiotlb

When we looked into FIO performance with swiotlb enabled in VM, we found
swiotlb_bounce() is always called one more time than expected for each DMA
read request.

It turns out that the bounce buffer is copied to original DMA buffer twice
after the completion of a DMA request (one is done by in
dma_direct_sync_single_for_cpu(), the other by swiotlb_tbl_unmap_single()).
But the content in bounce buffer actually doesn't change between the two
rounds of copy. So, one round of copy is redundant.

Pass DMA_ATTR_SKIP_CPU_SYNC flag to swiotlb_tbl_unmap_single() to
skip the memory copy in it.

This fix increases FIO 64KB sequential read throughput in a guest with
swiotlb=force by 5.6%.

Fixes: 55897af63091 ("dma-direct: merge swiotlb_dma_ops into the dma_direct code")
Reported-by: Wang Zhaoyang1 <zhaoyang1.wang@intel.com>
Reported-by: Gao Liang <liang.gao@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72e..8a6cd53dbe8c 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
 	if (unlikely(is_swiotlb_buffer(dev, phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+		swiotlb_tbl_unmap_single(dev, phys, size, dir,
+					 attrs | DMA_ATTR_SKIP_CPU_SYNC);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
-- 
cgit v1.2.3


From 9554e908fb5d02e48a681d1eca180225bf109e83 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Fri, 25 Mar 2022 11:39:51 -0400
Subject: ELF: Remove elf_core_copy_kernel_regs()

x86-32 was the last architecture that implemented separate user and
kernel registers.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220325153953.162643-3-brgerst@gmail.com
---
 kernel/kexec_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 68480f731192..be4b54c2c615 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1078,7 +1078,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
 		return;
 	memset(&prstatus, 0, sizeof(prstatus));
 	prstatus.common.pr_pid = current->pid;
-	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
 	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
 			      &prstatus, sizeof(prstatus));
 	final_note(buf);
-- 
cgit v1.2.3


From c48c8b829d2b966a6649827426bcdba082ccf922 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 14 Apr 2022 15:00:11 +0100
Subject: genirq: Take the proposed affinity at face value if force==true

Although setting the affinity of an interrupt to a set of CPUs that doesn't
have any online CPU is generally frowned apon, there are a few limited
cases where such affinity is set from a CPUHP notifier, setting the
affinity to a CPU that isn't online yet.

The saving grace is that this is always done using the 'force' attribute,
which gives a hint that the affinity setting can be outside of the online
CPU mask and the callsite set this flag with the knowledge that the
underlying interrupt controller knows to handle it.

This restores the expected behaviour on Marek's system.

Fixes: 33de0aa4bae9 ("genirq: Always limit the affinity to online CPUs")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/4b7fc13c-887b-a664-26e8-45aed13f048a@samsung.com
Link: https://lore.kernel.org/r/20220414140011.541725-1-maz@kernel.org
---
 kernel/irq/manage.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f71ecc100545..f1d5a94c6c9f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -266,10 +266,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 		prog_mask = mask;
 	}
 
-	/* Make sure we only provide online CPUs to the irqchip */
+	/*
+	 * Make sure we only provide online CPUs to the irqchip,
+	 * unless we are being asked to force the affinity (in which
+	 * case we do as we are told).
+	 */
 	cpumask_and(&tmp_mask, prog_mask, cpu_online_mask);
-	if (!cpumask_empty(&tmp_mask))
+	if (!force && !cpumask_empty(&tmp_mask))
 		ret = chip->irq_set_affinity(data, &tmp_mask, force);
+	else if (force)
+		ret = chip->irq_set_affinity(data, mask, force);
 	else
 		ret = -EINVAL;
 
-- 
cgit v1.2.3


From 3dc6ffae2da201284cb24af66af77ee0bbb2efaa Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@linutronix.de>
Date: Thu, 14 Apr 2022 11:18:03 +0200
Subject: timekeeping: Introduce fast accessor to clock tai

Introduce fast/NMI safe accessor to clock tai for tracing. The Linux kernel
tracing infrastructure has support for using different clocks to generate
timestamps for trace events. Especially in TSN networks it's useful to have TAI
as trace clock, because the application scheduling is done in accordance to the
network time, which is based on TAI. With a tai trace_clock in place, it becomes
very convenient to correlate network activity with Linux kernel application
traces.

Use the same implementation as ktime_get_boot_fast_ns() does by reading the
monotonic time and adding the TAI offset. The same limitations as for the fast
boot implementation apply. The TAI offset may change at run time e.g., by
setting the time or using adjtimex() with an offset. However, these kind of
offset changes are rare events. Nevertheless, the user has to be aware and deal
with it in post processing.

An alternative approach would be to use the same implementation as
ktime_get_real_fast_ns() does. However, this requires to add an additional u64
member to the tk_read_base struct. This struct together with a seqcount is
designed to fit into a single cache line on 64 bit architectures. Adding a new
member would violate this constraint.

Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220414091805.89667-2-kurt@linutronix.de
---
 kernel/time/timekeeping.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dcdcb85121e4..2c22023fbf5f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -532,6 +532,23 @@ u64 notrace ktime_get_boot_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
 
+/**
+ * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
+ *
+ * The same limitations as described for ktime_get_boot_fast_ns() apply. The
+ * mono time and the TAI offset are not read atomically which may yield wrong
+ * readouts. However, an update of the TAI offset is an rare event e.g., caused
+ * by settime or adjtimex with an offset. The user of this function has to deal
+ * with the possibility of wrong timestamps in post processing.
+ */
+u64 notrace ktime_get_tai_fast_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+
+	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
+}
+EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
+
 static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
 {
 	struct tk_read_base *tkr;
-- 
cgit v1.2.3


From 5d79fa0d33258d8e79064316ce8fae37e27bd34b Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 7 Apr 2022 15:46:12 +0800
Subject: ftrace: Fix build warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_SYSCTL and CONFIG_DYNAMIC_FTRACE is n, build warns:

kernel/trace/ftrace.c:7912:13: error: ‘is_permanent_ops_registered’ defined but not used [-Werror=unused-function]
 static bool is_permanent_ops_registered(void)
             ^~~~~~~~~~~~~~~~~~~~~~~~~~~
kernel/trace/ftrace.c:89:12: error: ‘last_ftrace_enabled’ defined but not used [-Werror=unused-variable]
 static int last_ftrace_enabled;
            ^~~~~~~~~~~~~~~~~~~

Move is_permanent_ops_registered() to ifdef block and mark last_ftrace_enabled as
__maybe_unused to fix this.

Fixes: 7cde53da38a3 ("ftrace: move sysctl_ftrace_enabled to ftrace.c")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a5efbbc289b4..db8d553728b6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -86,7 +86,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = {
 
 /* ftrace_enabled is a method to turn ftrace on or off */
 int ftrace_enabled __read_mostly;
-static int last_ftrace_enabled;
+static int __maybe_unused last_ftrace_enabled;
 
 /* Current function tracing op */
 struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
@@ -7909,6 +7909,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 }
 EXPORT_SYMBOL_GPL(unregister_ftrace_function);
 
+#ifdef CONFIG_SYSCTL
 static bool is_permanent_ops_registered(void)
 {
 	struct ftrace_ops *op;
@@ -7921,7 +7922,6 @@ static bool is_permanent_ops_registered(void)
 	return false;
 }
 
-#ifdef CONFIG_SYSCTL
 static int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
 		     void *buffer, size_t *lenp, loff_t *ppos)
-- 
cgit v1.2.3


From 25934fcfb93c4687ad32fd3d062bcf03457129d4 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Thu, 14 Apr 2022 19:13:34 -0700
Subject: irq_work: use kasan_record_aux_stack_noalloc() record callstack

On PREEMPT_RT kernel and KASAN is enabled.  the kasan_record_aux_stack()
may call alloc_pages(), and the rt-spinlock will be acquired, if currently
in atomic context, will trigger warning:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
  in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 239, name: bootlogd
  Preemption disabled at:
  [<ffffffffbab1a531>] rt_mutex_slowunlock+0xa1/0x4e0
  CPU: 3 PID: 239 Comm: bootlogd Tainted: G        W 5.17.1-rt17-yocto-preempt-rt+ #105
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014
  Call Trace:
     __might_resched.cold+0x13b/0x173
     rt_spin_lock+0x5b/0xf0
     get_page_from_freelist+0x20c/0x1610
     __alloc_pages+0x25e/0x5e0
     __stack_depot_save+0x3c0/0x4a0
     kasan_save_stack+0x3a/0x50
     __kasan_record_aux_stack+0xb6/0xc0
     kasan_record_aux_stack+0xe/0x10
     irq_work_queue_on+0x6a/0x1c0
     pull_rt_task+0x631/0x6b0
     do_balance_callbacks+0x56/0x80
     __balance_callbacks+0x63/0x90
     rt_mutex_setprio+0x349/0x880
     rt_mutex_slowunlock+0x22a/0x4e0
     rt_spin_unlock+0x49/0x80
     uart_write+0x186/0x2b0
     do_output_char+0x2e9/0x3a0
     n_tty_write+0x306/0x800
     file_tty_write.isra.0+0x2af/0x450
     tty_write+0x22/0x30
     new_sync_write+0x27c/0x3a0
     vfs_write+0x3f7/0x5d0
     ksys_write+0xd9/0x180
     __x64_sys_write+0x43/0x50
     do_syscall_64+0x44/0x90
     entry_SYSCALL_64_after_hwframe+0x44/0xae

Fix it by using kasan_record_aux_stack_noalloc() to avoid the call to
alloc_pages().

Link: https://lkml.kernel.org/r/20220402142555.2699582-1-qiang1.zhang@intel.com
Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f7df715ec28e..7afa40fe5cc4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	if (!irq_work_claim(work))
 		return false;
 
-	kasan_record_aux_stack(work);
+	kasan_record_aux_stack_noalloc(work);
 
 	preempt_disable();
 	if (cpu != smp_processor_id()) {
-- 
cgit v1.2.3


From 07410559f38360885e91cff1b800168681ac515c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 14 Feb 2022 09:44:32 +0100
Subject: dma-direct: use is_swiotlb_active in dma_direct_map_page

Use the more specific is_swiotlb_active check instead of checking the
global swiotlb_force variable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/direct.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 8a6cd53dbe8c..a78c0ba70645 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -91,7 +91,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 		return swiotlb_map(dev, phys, size, dir, attrs);
 
 	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-		if (swiotlb_force != SWIOTLB_NO_FORCE)
+		if (is_swiotlb_active(dev))
 			return swiotlb_map(dev, phys, size, dir, attrs);
 
 		dev_WARN_ONCE(dev, 1,
-- 
cgit v1.2.3


From 3469d36d470df148e8d940c1a6399510562bf3b0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 14 Feb 2022 10:01:26 +0100
Subject: swiotlb: make swiotlb_exit a no-op if SWIOTLB_FORCE is set

If force bouncing is enabled we can't release the buffers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 73a41cec9e38..98bb0eb44a7b 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -369,6 +369,9 @@ void __init swiotlb_exit(void)
 	unsigned long tbl_vaddr;
 	size_t tbl_size, slots_size;
 
+	if (swiotlb_force == SWIOTLB_FORCE)
+		return;
+
 	if (!mem->nslabs)
 		return;
 
-- 
cgit v1.2.3


From a2daa27c0c6137481226aee5b3136e453c642929 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 14 Feb 2022 11:44:42 +0100
Subject: swiotlb: simplify swiotlb_max_segment

Remove the bogus Xen override that was usually larger than the actual
size and just calculate the value on demand.  Note that
swiotlb_max_segment still doesn't make sense as an interface and should
eventually be removed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 98bb0eb44a7b..e0127e397335 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -68,12 +68,6 @@ struct io_tlb_mem io_tlb_default_mem;
 
 phys_addr_t swiotlb_unencrypted_base;
 
-/*
- * Max segment that we can provide which (if pages are contingous) will
- * not be bounced (unless SWIOTLB_FORCE is set).
- */
-static unsigned int max_segment;
-
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
 
 static int __init
@@ -97,18 +91,12 @@ early_param("swiotlb", setup_io_tlb_npages);
 
 unsigned int swiotlb_max_segment(void)
 {
-	return io_tlb_default_mem.nslabs ? max_segment : 0;
+	if (!io_tlb_default_mem.nslabs)
+		return 0;
+	return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
 }
 EXPORT_SYMBOL_GPL(swiotlb_max_segment);
 
-void swiotlb_set_max_segment(unsigned int val)
-{
-	if (swiotlb_force == SWIOTLB_FORCE)
-		max_segment = 1;
-	else
-		max_segment = rounddown(val, PAGE_SIZE);
-}
-
 unsigned long swiotlb_size_or_default(void)
 {
 	return default_nslabs << IO_TLB_SHIFT;
@@ -258,7 +246,6 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 
 	if (verbose)
 		swiotlb_print_info();
-	swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
 	return 0;
 }
 
@@ -359,7 +346,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
 
 	swiotlb_print_info();
-	swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0d5ffd9a256d8995764f9d4a35a8c3917839d169 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 14 Feb 2022 11:07:28 +0100
Subject: swiotlb: rename swiotlb_late_init_with_default_size

swiotlb_late_init_with_default_size is an overly verbose name that
doesn't even catch what the function is doing, given that the size is
not just a default but the actual requested size.

Rename it to swiotlb_init_late.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e0127e397335..9a4fe6e48a07 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -281,11 +281,9 @@ fail:
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int
-swiotlb_late_init_with_default_size(size_t default_size)
+int swiotlb_init_late(size_t size)
 {
-	unsigned long nslabs =
-		ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
 	unsigned long bytes;
 	unsigned char *vstart = NULL;
 	unsigned int order;
-- 
cgit v1.2.3


From c6af2aa9ffc9763826607bc2664ef3ea4475ed18 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 29 Mar 2022 17:27:33 +0200
Subject: swiotlb: make the swiotlb_init interface more useful

Pass a boolean flag to indicate if swiotlb needs to be enabled based on
the addressing needs, and replace the verbose argument with a set of
flags, including one to force enable bounce buffering.

Note that this patch removes the possibility to force xen-swiotlb use
with the swiotlb=force parameter on the command line on x86 (arm and
arm64 never supported that), but this interface will be restored shortly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9a4fe6e48a07..86e877a96b82 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,7 +62,8 @@
 
 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
-enum swiotlb_force swiotlb_force;
+static bool swiotlb_force_bounce;
+static bool swiotlb_force_disable;
 
 struct io_tlb_mem io_tlb_default_mem;
 
@@ -81,9 +82,9 @@ setup_io_tlb_npages(char *str)
 	if (*str == ',')
 		++str;
 	if (!strcmp(str, "force"))
-		swiotlb_force = SWIOTLB_FORCE;
+		swiotlb_force_bounce = true;
 	else if (!strcmp(str, "noforce"))
-		swiotlb_force = SWIOTLB_NO_FORCE;
+		swiotlb_force_disable = true;
 
 	return 0;
 }
@@ -202,7 +203,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->index = 0;
 	mem->late_alloc = late_alloc;
 
-	if (swiotlb_force == SWIOTLB_FORCE)
+	if (swiotlb_force_bounce)
 		mem->force_bounce = true;
 
 	spin_lock_init(&mem->lock);
@@ -224,12 +225,13 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	return;
 }
 
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs,
+		unsigned int flags)
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	size_t alloc_size;
 
-	if (swiotlb_force == SWIOTLB_NO_FORCE)
+	if (swiotlb_force_disable)
 		return 0;
 
 	/* protect against double initialization */
@@ -243,8 +245,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 		      __func__, alloc_size, PAGE_SIZE);
 
 	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
+	mem->force_bounce = flags & SWIOTLB_FORCE;
 
-	if (verbose)
+	if (flags & SWIOTLB_VERBOSE)
 		swiotlb_print_info();
 	return 0;
 }
@@ -253,20 +256,21 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
  */
-void  __init
-swiotlb_init(int verbose)
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
 {
 	size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
 	void *tlb;
 
-	if (swiotlb_force == SWIOTLB_NO_FORCE)
+	if (!addressing_limit && !swiotlb_force_bounce)
+		return;
+	if (swiotlb_force_disable)
 		return;
 
 	/* Get IO TLB memory from the low pages */
 	tlb = memblock_alloc_low(bytes, PAGE_SIZE);
 	if (!tlb)
 		goto fail;
-	if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
+	if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
 		goto fail_free_mem;
 	return;
 
@@ -289,7 +293,7 @@ int swiotlb_init_late(size_t size)
 	unsigned int order;
 	int rc = 0;
 
-	if (swiotlb_force == SWIOTLB_NO_FORCE)
+	if (swiotlb_force_disable)
 		return 0;
 
 	/*
@@ -328,7 +332,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long bytes = nslabs << IO_TLB_SHIFT;
 
-	if (swiotlb_force == SWIOTLB_NO_FORCE)
+	if (swiotlb_force_disable)
 		return 0;
 
 	/* protect against double initialization */
@@ -353,7 +357,7 @@ void __init swiotlb_exit(void)
 	unsigned long tbl_vaddr;
 	size_t tbl_size, slots_size;
 
-	if (swiotlb_force == SWIOTLB_FORCE)
+	if (swiotlb_force_bounce)
 		return;
 
 	if (!mem->nslabs)
@@ -704,8 +708,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 	phys_addr_t swiotlb_addr;
 	dma_addr_t dma_addr;
 
-	trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
-			      swiotlb_force);
+	trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
 
 	swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
 			attrs);
-- 
cgit v1.2.3


From 8ba2ed1be90fc210126f68186564707478552c95 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 28 Feb 2022 13:36:57 +0200
Subject: swiotlb: add a SWIOTLB_ANY flag to lift the low memory restriction

Power SVM wants to allocate a swiotlb buffer that is not restricted to
low memory for the trusted hypervisor scheme.  Consolidate the support
for this into the swiotlb_init interface by adding a new flag.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 86e877a96b82..f6e091424af3 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -266,8 +266,15 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags)
 	if (swiotlb_force_disable)
 		return;
 
-	/* Get IO TLB memory from the low pages */
-	tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+	/*
+	 * By default allocate the bounce buffer memory from low memory, but
+	 * allow to pick a location everywhere for hypervisors with guest
+	 * memory encryption.
+	 */
+	if (flags & SWIOTLB_ANY)
+		tlb = memblock_alloc(bytes, PAGE_SIZE);
+	else
+		tlb = memblock_alloc_low(bytes, PAGE_SIZE);
 	if (!tlb)
 		goto fail;
 	if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
-- 
cgit v1.2.3


From 742519538e6b07250c8085bbff4bd358bc03bf16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 14 Feb 2022 11:12:59 +0100
Subject: swiotlb: pass a gfp_mask argument to swiotlb_init_late

Let the caller chose a zone to allocate from.  This will be used
later on by the xen-swiotlb initialization on arm.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f6e091424af3..119187afc65e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -292,7 +292,7 @@ fail:
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int swiotlb_init_late(size_t size)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask)
 {
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
 	unsigned long bytes;
@@ -303,15 +303,12 @@ int swiotlb_init_late(size_t size)
 	if (swiotlb_force_disable)
 		return 0;
 
-	/*
-	 * Get IO TLB memory from the low pages
-	 */
 	order = get_order(nslabs << IO_TLB_SHIFT);
 	nslabs = SLABS_PER_PAGE << order;
 	bytes = nslabs << IO_TLB_SHIFT;
 
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-		vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+		vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
 						  order);
 		if (vstart)
 			break;
-- 
cgit v1.2.3


From 7374153d294eb51de5a81ac38ff1c4fef8927bec Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 14 Mar 2022 08:02:57 +0100
Subject: swiotlb: provide swiotlb_init variants that remap the buffer

To shared more code between swiotlb and xen-swiotlb, offer a
swiotlb_init_remap interface and add a remap callback to
swiotlb_init_late that will allow Xen to remap the buffer without
duplicating much of the logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 119187afc65e..f6acfc7a41bf 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -256,9 +256,11 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs,
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
  */
-void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+		int (*remap)(void *tlb, unsigned long nslabs))
 {
-	size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+	unsigned long nslabs = default_nslabs;
+	size_t bytes;
 	void *tlb;
 
 	if (!addressing_limit && !swiotlb_force_bounce)
@@ -271,12 +273,23 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags)
 	 * allow to pick a location everywhere for hypervisors with guest
 	 * memory encryption.
 	 */
+retry:
+	bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
 	if (flags & SWIOTLB_ANY)
 		tlb = memblock_alloc(bytes, PAGE_SIZE);
 	else
 		tlb = memblock_alloc_low(bytes, PAGE_SIZE);
 	if (!tlb)
 		goto fail;
+	if (remap && remap(tlb, nslabs) < 0) {
+		memblock_free(tlb, PAGE_ALIGN(bytes));
+
+		nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+		if (nslabs < IO_TLB_MIN_SLABS)
+			panic("%s: Failed to remap %zu bytes\n",
+			      __func__, bytes);
+		goto retry;
+	}
 	if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
 		goto fail_free_mem;
 	return;
@@ -287,12 +300,18 @@ fail:
 	pr_warn("Cannot allocate buffer");
 }
 
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+{
+	return swiotlb_init_remap(addressing_limit, flags, NULL);
+}
+
 /*
  * Systems with larger DMA zones (those that don't support ISA) can
  * initialize the swiotlb later using the slab allocator if needed.
  * This should be just like above, but with some error catching.
  */
-int swiotlb_init_late(size_t size, gfp_t gfp_mask)
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+		int (*remap)(void *tlb, unsigned long nslabs))
 {
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
 	unsigned long bytes;
@@ -303,6 +322,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask)
 	if (swiotlb_force_disable)
 		return 0;
 
+retry:
 	order = get_order(nslabs << IO_TLB_SHIFT);
 	nslabs = SLABS_PER_PAGE << order;
 	bytes = nslabs << IO_TLB_SHIFT;
@@ -323,6 +343,16 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask)
 			(PAGE_SIZE << order) >> 20);
 		nslabs = SLABS_PER_PAGE << order;
 	}
+	if (remap)
+		rc = remap(vstart, nslabs);
+	if (rc) {
+		free_pages((unsigned long)vstart, order);
+
+		nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+		if (nslabs < IO_TLB_MIN_SLABS)
+			return rc;
+		goto retry;
+	}
 	rc = swiotlb_late_init_with_tbl(vstart, nslabs);
 	if (rc)
 		free_pages((unsigned long)vstart, order);
-- 
cgit v1.2.3


From 6424e31b1c050a25aea033206d5f626f3523448c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 15 Mar 2022 07:41:04 +0100
Subject: swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl

No users left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Tested-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 kernel/dma/swiotlb.c | 77 ++++++++++++++--------------------------------------
 1 file changed, 20 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f6acfc7a41bf..e2ef0864eb1e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -225,33 +225,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	return;
 }
 
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs,
-		unsigned int flags)
-{
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	size_t alloc_size;
-
-	if (swiotlb_force_disable)
-		return 0;
-
-	/* protect against double initialization */
-	if (WARN_ON_ONCE(mem->nslabs))
-		return -ENOMEM;
-
-	alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
-	mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!mem->slots)
-		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-		      __func__, alloc_size, PAGE_SIZE);
-
-	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
-	mem->force_bounce = flags & SWIOTLB_FORCE;
-
-	if (flags & SWIOTLB_VERBOSE)
-		swiotlb_print_info();
-	return 0;
-}
-
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -259,7 +232,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs,
 void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 		int (*remap)(void *tlb, unsigned long nslabs))
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = default_nslabs;
+	size_t alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
 	size_t bytes;
 	void *tlb;
 
@@ -280,7 +255,8 @@ retry:
 	else
 		tlb = memblock_alloc_low(bytes, PAGE_SIZE);
 	if (!tlb)
-		goto fail;
+		panic("%s: failed to allocate tlb structure\n", __func__);
+
 	if (remap && remap(tlb, nslabs) < 0) {
 		memblock_free(tlb, PAGE_ALIGN(bytes));
 
@@ -290,14 +266,17 @@ retry:
 			      __func__, bytes);
 		goto retry;
 	}
-	if (swiotlb_init_with_tbl(tlb, default_nslabs, flags))
-		goto fail_free_mem;
-	return;
 
-fail_free_mem:
-	memblock_free(tlb, bytes);
-fail:
-	pr_warn("Cannot allocate buffer");
+	mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!mem->slots)
+		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+		      __func__, alloc_size, PAGE_SIZE);
+
+	swiotlb_init_io_tlb_mem(mem, __pa(tlb), default_nslabs, false);
+	mem->force_bounce = flags & SWIOTLB_FORCE;
+
+	if (flags & SWIOTLB_VERBOSE)
+		swiotlb_print_info();
 }
 
 void __init swiotlb_init(bool addressing_limit, unsigned int flags)
@@ -313,6 +292,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags)
 int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 		int (*remap)(void *tlb, unsigned long nslabs))
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
 	unsigned long bytes;
 	unsigned char *vstart = NULL;
@@ -353,33 +333,16 @@ retry:
 			return rc;
 		goto retry;
 	}
-	rc = swiotlb_late_init_with_tbl(vstart, nslabs);
-	if (rc)
-		free_pages((unsigned long)vstart, order);
-
-	return rc;
-}
-
-int
-swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
-{
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long bytes = nslabs << IO_TLB_SHIFT;
-
-	if (swiotlb_force_disable)
-		return 0;
-
-	/* protect against double initialization */
-	if (WARN_ON_ONCE(mem->nslabs))
-		return -ENOMEM;
 
 	mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 		get_order(array_size(sizeof(*mem->slots), nslabs)));
-	if (!mem->slots)
+	if (!mem->slots) {
+		free_pages((unsigned long)vstart, order);
 		return -ENOMEM;
+	}
 
-	set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
-	swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
+	set_memory_decrypted((unsigned long)vstart, bytes >> PAGE_SHIFT);
+	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true);
 
 	swiotlb_print_info();
 	return 0;
-- 
cgit v1.2.3


From 6c846d026d490b2383d395bc8e7b06336219667b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 19 Apr 2022 15:18:37 +0100
Subject: gpio: Don't fiddle with irqchips marked as immutable

In order to move away from gpiolib messing with the internals of
unsuspecting irqchips, add a flag by which irqchips advertise
that they are not to be messed with, and do solemnly swear that
they correctly call into the gpiolib helpers when required.

Also nudge the users into converting their drivers to the
new model.

Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220419141846.598305-2-maz@kernel.org
---
 kernel/irq/debugfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 2b43f5f5033d..bc8e40cf2b65 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -58,6 +58,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
 	BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
 	BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
 	BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
+	BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
 };
 
 static void
-- 
cgit v1.2.3


From 055eb95533273bc334794dbc598400d10800528f Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 14 Apr 2022 09:12:33 -0700
Subject: bpf: Move rcu lock management out of BPF_PROG_RUN routines

Commit 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros
into functions") switched a bunch of BPF_PROG_RUN macros to inline
routines. This changed the semantic a bit. Due to arguments expansion
of macros, it used to be:

	rcu_read_lock();
	array = rcu_dereference(cgrp->bpf.effective[atype]);
	...

Now, with with inline routines, we have:
	array_rcu = rcu_dereference(cgrp->bpf.effective[atype]);
	/* array_rcu can be kfree'd here */
	rcu_read_lock();
	array = rcu_dereference(array_rcu);

I'm assuming in practice rcu subsystem isn't fast enough to trigger
this but let's use rcu API properly.

Also, rename to lower caps to not confuse with macros. Additionally,
drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY.

See [1] for more context.

  [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u

v2
- keep rcu locks inside by passing cgroup_bpf

Fixes: 7d08c2c91171 ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220414161233.170780-1-sdf@google.com
---
 kernel/bpf/cgroup.c      | 124 ++++++++++++++++++++++++++++++++++++++++-------
 kernel/trace/bpf_trace.c |   5 +-
 2 files changed, 111 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 128028efda64..0cb6211fcb58 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -22,6 +22,72 @@
 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+/* __always_inline is necessary to prevent indirect call through run_prog
+ * function pointer.
+ */
+static __always_inline int
+bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
+			    enum cgroup_bpf_attach_type atype,
+			    const void *ctx, bpf_prog_run_fn run_prog,
+			    int retval, u32 *ret_flags)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_cg_run_ctx run_ctx;
+	u32 func_ret;
+
+	run_ctx.retval = retval;
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(cgrp->effective[atype]);
+	item = &array->items[0];
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.prog_item = item;
+		func_ret = run_prog(prog, ctx);
+		if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
+			run_ctx.retval = -EPERM;
+		*(ret_flags) |= (func_ret >> 1);
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	rcu_read_unlock();
+	migrate_enable();
+	return run_ctx.retval;
+}
+
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+		      enum cgroup_bpf_attach_type atype,
+		      const void *ctx, bpf_prog_run_fn run_prog,
+		      int retval)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_cg_run_ctx run_ctx;
+
+	run_ctx.retval = retval;
+	migrate_disable();
+	rcu_read_lock();
+	array = rcu_dereference(cgrp->effective[atype]);
+	item = &array->items[0];
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	while ((prog = READ_ONCE(item->prog))) {
+		run_ctx.prog_item = item;
+		if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
+			run_ctx.retval = -EPERM;
+		item++;
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+	rcu_read_unlock();
+	migrate_enable();
+	return run_ctx.retval;
+}
+
 void cgroup_bpf_offline(struct cgroup *cgrp)
 {
 	cgroup_get(cgrp);
@@ -1075,11 +1141,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
 	if (atype == CGROUP_INET_EGRESS) {
-		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
-			cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
+		u32 flags = 0;
+		bool cn;
+
+		ret = bpf_prog_run_array_cg_flags(
+			&cgrp->bpf, atype,
+			skb, __bpf_prog_run_save_cb, 0, &flags);
+
+		/* Return values of CGROUP EGRESS BPF programs are:
+		 *   0: drop packet
+		 *   1: keep packet
+		 *   2: drop packet and cn
+		 *   3: keep packet and cn
+		 *
+		 * The returned value is then converted to one of the NET_XMIT
+		 * or an error code that is then interpreted as drop packet
+		 * (and no cn):
+		 *   0: NET_XMIT_SUCCESS  skb should be transmitted
+		 *   1: NET_XMIT_DROP     skb should be dropped and cn
+		 *   2: NET_XMIT_CN       skb should be transmitted and cn
+		 *   3: -err              skb should be dropped
+		 */
+
+		cn = flags & BPF_RET_SET_CN;
+		if (ret && !IS_ERR_VALUE((long)ret))
+			ret = -EFAULT;
+		if (!ret)
+			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+		else
+			ret = (cn ? NET_XMIT_DROP : ret);
 	} else {
-		ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
-					    __bpf_prog_run_save_cb, 0);
+		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+					    skb, __bpf_prog_run_save_cb, 0);
 		if (ret && !IS_ERR_VALUE((long)ret))
 			ret = -EFAULT;
 	}
@@ -1109,8 +1202,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
-	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
-				     bpf_prog_run, 0);
+	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -1155,8 +1247,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
-					   bpf_prog_run, 0, flags);
+	return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype,
+					   &ctx, bpf_prog_run, 0, flags);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
@@ -1182,8 +1274,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
-	return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
-				     bpf_prog_run, 0);
+	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+				     0);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
@@ -1200,8 +1292,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-				    bpf_prog_run, 0);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
 	rcu_read_unlock();
 
 	return ret;
@@ -1366,8 +1457,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
-				    bpf_prog_run, 0);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1459,7 +1549,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
 				    &ctx, bpf_prog_run, 0);
 	release_sock(sk);
 
@@ -1559,7 +1649,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	lock_sock(sk);
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
 				    &ctx, bpf_prog_run, retval);
 	release_sock(sk);
 
@@ -1608,7 +1698,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 * be called if that data shouldn't be "exported".
 	 */
 
-	ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
 				    &ctx, bpf_prog_run, retval);
 	if (ret < 0)
 		return ret;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b26f3da943de..f15b826f9899 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -129,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	 * out of events when it was updated in between this and the
 	 * rcu_dereference() which is accepted risk.
 	 */
-	ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
+	rcu_read_lock();
+	ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
+				 ctx, bpf_prog_run);
+	rcu_read_unlock();
 
  out:
 	__this_cpu_dec(bpf_prog_active);
-- 
cgit v1.2.3


From 40f5aa4c5eaebfeaca4566217cb9c468e28ed682 Mon Sep 17 00:00:00 2001
From: kuyo chang <kuyo.chang@mediatek.com>
Date: Thu, 14 Apr 2022 17:02:20 +0800
Subject: sched/pelt: Fix attach_entity_load_avg() corner case

The warning in cfs_rq_is_decayed() triggered:

    SCHED_WARN_ON(cfs_rq->avg.load_avg ||
		  cfs_rq->avg.util_avg ||
		  cfs_rq->avg.runnable_avg)

There exists a corner case in attach_entity_load_avg() which will
cause load_sum to be zero while load_avg will not be.

Consider se_weight is 88761 as per the sched_prio_to_weight[] table.
Further assume the get_pelt_divider() is 47742, this gives:
se->avg.load_avg is 1.

However, calculating load_sum:

  se->avg.load_sum = div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
  se->avg.load_sum = 1*47742/88761 = 0.

Then enqueue_load_avg() adds this to the cfs_rq totals:

  cfs_rq->avg.load_avg += se->avg.load_avg;
  cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;

Resulting in load_avg being 1 with load_sum is 0, which will trigger
the WARN.

Fixes: f207934fb79d ("sched/fair: Align PELT windows between cfs_rq and its se")
Signed-off-by: kuyo chang <kuyo.chang@mediatek.com>
[peterz: massage changelog]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/r/20220414090229.342-1-kuyo.chang@mediatek.com
---
 kernel/sched/fair.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bd299d67ab..a68482d66535 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3829,11 +3829,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 	se->avg.runnable_sum = se->avg.runnable_avg * divider;
 
-	se->avg.load_sum = divider;
-	if (se_weight(se)) {
-		se->avg.load_sum =
-			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
-	}
+	se->avg.load_sum = se->avg.load_avg * divider;
+	if (se_weight(se) < se->avg.load_sum)
+		se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+	else
+		se->avg.load_sum = 1;
 
 	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
-- 
cgit v1.2.3


From 60490e7966659b26d74bf1fa4aa8693d9a94ca88 Mon Sep 17 00:00:00 2001
From: Zhipeng Xie <xiezhipeng1@huawei.com>
Date: Wed, 9 Feb 2022 09:54:17 -0500
Subject: perf/core: Fix perf_mmap fail when CONFIG_PERF_USE_VMALLOC enabled

This problem can be reproduced with CONFIG_PERF_USE_VMALLOC enabled on
both x86_64 and aarch64 arch when using sysdig -B(using ebpf)[1].
sysdig -B works fine after rebuilding the kernel with
CONFIG_PERF_USE_VMALLOC disabled.

I tracked it down to the if condition event->rb->nr_pages != nr_pages
in perf_mmap is true when CONFIG_PERF_USE_VMALLOC is enabled where
event->rb->nr_pages = 1 and nr_pages = 2048 resulting perf_mmap to
return -EINVAL. This is because when CONFIG_PERF_USE_VMALLOC is
enabled, rb->nr_pages is always equal to 1.

Arch with CONFIG_PERF_USE_VMALLOC enabled by default:
	arc/arm/csky/mips/sh/sparc/xtensa

Arch with CONFIG_PERF_USE_VMALLOC disabled by default:
	x86_64/aarch64/...

Fix this problem by using data_page_nr()

[1] https://github.com/draios/sysdig

Fixes: 906010b2134e ("perf_event: Provide vmalloc() based mmap() backing")
Signed-off-by: Zhipeng Xie <xiezhipeng1@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220209145417.6495-1-xiezhipeng1@huawei.com
---
 kernel/events/core.c        | 2 +-
 kernel/events/internal.h    | 5 +++++
 kernel/events/ring_buffer.c | 5 -----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 23bb19716ad3..7858bafffa9d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6247,7 +6247,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages) {
+		if (data_page_nr(event->rb) != nr_pages) {
 			ret = -EINVAL;
 			goto unlock;
 		}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 082832738c8f..5150d5f84c03 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb)
 }
 #endif
 
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+	return rb->nr_pages << page_order(rb);
+}
+
 static inline unsigned long perf_data_size(struct perf_buffer *rb)
 {
 	return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 52868716ec35..fb35b926024c 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -859,11 +859,6 @@ void rb_free(struct perf_buffer *rb)
 }
 
 #else
-static int data_page_nr(struct perf_buffer *rb)
-{
-	return rb->nr_pages << page_order(rb);
-}
-
 static struct page *
 __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
 {
-- 
cgit v1.2.3


From eb596b0905584a9389585b0f437cf8a2faeb14d0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Tue, 19 Apr 2022 22:16:07 +0530
Subject: bpf: Ensure type tags precede modifiers in BTF

It is guaranteed that for modifiers, clang always places type tags
before other modifiers, and then the base type. We would like to rely on
this guarantee inside the kernel to make it simple to parse type tags
from BTF.

However, a user would be allowed to construct a BTF without such
guarantees. Hence, add a pass to check that in modifier chains, type
tags only occur at the head of the chain, and then don't occur later in
the chain.

If we see a type tag, we can have one or more type tags preceding other
modifiers that then never have another type tag. If we see other
modifiers, all modifiers following them should never be a type tag.

Instead of having to walk chains we verified previously, we can remember
the last good modifier type ID which headed a good chain. At that point,
we must have verified all other chains headed by type IDs less than it.
This makes the verification process less costly, and it becomes a simple
O(n) pass.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220419164608.1990559-2-memxor@gmail.com
---
 kernel/bpf/btf.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0918a39279f6..7906b9bf7ff8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4541,6 +4541,48 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 	return 0;
 }
 
+static int btf_check_type_tags(struct btf_verifier_env *env,
+			       struct btf *btf, int start_id)
+{
+	int i, n, good_id = start_id - 1;
+	bool in_tags;
+
+	n = btf_nr_types(btf);
+	for (i = start_id; i < n; i++) {
+		const struct btf_type *t;
+		u32 cur_id = i;
+
+		t = btf_type_by_id(btf, i);
+		if (!t)
+			return -EINVAL;
+		if (!btf_type_is_modifier(t))
+			continue;
+
+		cond_resched();
+
+		in_tags = btf_type_is_type_tag(t);
+		while (btf_type_is_modifier(t)) {
+			if (btf_type_is_type_tag(t)) {
+				if (!in_tags) {
+					btf_verifier_log(env, "Type tags don't precede modifiers");
+					return -EINVAL;
+				}
+			} else if (in_tags) {
+				in_tags = false;
+			}
+			if (cur_id <= good_id)
+				break;
+			/* Move to next type */
+			cur_id = t->type;
+			t = btf_type_by_id(btf, cur_id);
+			if (!t)
+				return -EINVAL;
+		}
+		good_id = i;
+	}
+	return 0;
+}
+
 static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 			     u32 log_level, char __user *log_ubuf, u32 log_size)
 {
@@ -4608,6 +4650,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	if (err)
 		goto errout;
 
+	err = btf_check_type_tags(env, btf, 1);
+	if (err)
+		goto errout;
+
 	if (log->level && bpf_verifier_log_full(log)) {
 		err = -ENOSPC;
 		goto errout;
@@ -4809,6 +4855,10 @@ struct btf *btf_parse_vmlinux(void)
 	if (err)
 		goto errout;
 
+	err = btf_check_type_tags(env, btf, 1);
+	if (err)
+		goto errout;
+
 	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
 	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
 
@@ -4894,6 +4944,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
 	if (err)
 		goto errout;
 
+	err = btf_check_type_tags(env, btf, btf_nr_types(base_btf));
+	if (err)
+		goto errout;
+
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
 	return btf;
-- 
cgit v1.2.3


From dcf456c9a095a6e71f53d6f6f004133ee851ee70 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Mon, 18 Apr 2022 15:51:58 +0000
Subject: bpf: Fix usage of trace RCU in local storage.

bpf_{sk,task,inode}_storage_free() do not need to use
call_rcu_tasks_trace as no BPF program should be accessing the owner
as it's being destroyed. The only other reader at this point is
bpf_local_storage_map_free() which uses normal RCU.

The only path that needs trace RCU are:

* bpf_local_storage_{delete,update} helpers
* map_{delete,update}_elem() syscalls

Fixes: 0fe4b381a59e ("bpf: Allow bpf_local_storage to be used by sleepable programs")
Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220418155158.2865678-1-kpsingh@kernel.org
---
 kernel/bpf/bpf_inode_storage.c |  4 ++--
 kernel/bpf/bpf_local_storage.c | 29 +++++++++++++++++++----------
 kernel/bpf/bpf_task_storage.c  |  4 ++--
 3 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96be8d518885..10424a1cda51 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -90,7 +90,7 @@ void bpf_inode_storage_free(struct inode *inode)
 		 */
 		bpf_selem_unlink_map(selem);
 		free_inode_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false);
+			local_storage, selem, false, false);
 	}
 	raw_spin_unlock_bh(&local_storage->lock);
 	rcu_read_unlock();
@@ -149,7 +149,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata));
+	bpf_selem_unlink(SELEM(sdata), true);
 
 	return 0;
 }
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 01aa2b51ec4d..8ce40fd869f6 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -106,7 +106,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
  */
 bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 				     struct bpf_local_storage_elem *selem,
-				     bool uncharge_mem)
+				     bool uncharge_mem, bool use_trace_rcu)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
@@ -150,11 +150,16 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+	if (use_trace_rcu)
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+	else
+		kfree_rcu(selem, rcu);
+
 	return free_local_storage;
 }
 
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+				       bool use_trace_rcu)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
@@ -169,12 +174,16 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true);
+			local_storage, selem, true, use_trace_rcu);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	if (free_local_storage)
-		call_rcu_tasks_trace(&local_storage->rcu,
+	if (free_local_storage) {
+		if (use_trace_rcu)
+			call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_rcu);
+		else
+			kfree_rcu(local_storage, rcu);
+	}
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -214,14 +223,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
 {
 	/* Always unlink from map before unlinking from local_storage
 	 * because selem will be freed after successfully unlinked from
 	 * the local_storage.
 	 */
 	bpf_selem_unlink_map(selem);
-	__bpf_selem_unlink_storage(selem);
+	__bpf_selem_unlink_storage(selem, use_trace_rcu);
 }
 
 struct bpf_local_storage_data *
@@ -466,7 +475,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false);
+						false, true);
 	}
 
 unlock:
@@ -548,7 +557,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
 				migrate_disable();
 				__this_cpu_inc(*busy_counter);
 			}
-			bpf_selem_unlink(selem);
+			bpf_selem_unlink(selem, false);
 			if (busy_counter) {
 				__this_cpu_dec(*busy_counter);
 				migrate_enable();
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 6638a0ecc3d2..57904263a710 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -102,7 +102,7 @@ void bpf_task_storage_free(struct task_struct *task)
 		 */
 		bpf_selem_unlink_map(selem);
 		free_task_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false);
+			local_storage, selem, false, false);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	bpf_task_storage_unlock();
@@ -192,7 +192,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata));
+	bpf_selem_unlink(SELEM(sdata), true);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3791a22374715b36ad806db13d8b2afb1b57fd36 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 28 Feb 2022 18:08:33 -0800
Subject: kernel/smp: Provide boot-time timeout for CSD lock diagnostics

Debugging of problems involving insanely long-running SMI handlers
proceeds better if the CSD-lock timeout can be adjusted.  This commit
therefore provides a new smp.csd_lock_timeout kernel boot parameter
that specifies the timeout in milliseconds.  The default remains at the
previously hard-coded value of five seconds.

[ paulmck: Apply feedback from Juergen Gross. ]

Cc: Rik van Riel <riel@surriel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/smp.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 01a7c1706a58..6a1f1daa3dc4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -183,7 +183,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
 static DEFINE_PER_CPU(void *, cur_csd_info);
 static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local);
 
-#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC)
+static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
+module_param(csd_lock_timeout, ulong, 0444);
+
 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 static u64 cfd_seq;
 
@@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 	u64 ts2, ts_delta;
 	call_single_data_t *cpu_cur_csd;
 	unsigned int flags = READ_ONCE(csd->node.u_flags);
+	unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;
 
 	if (!(flags & CSD_FLAG_LOCK)) {
 		if (!unlikely(*bug_id))
@@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 
 	ts2 = sched_clock();
 	ts_delta = ts2 - *ts1;
-	if (likely(ts_delta <= CSD_LOCK_TIMEOUT))
+	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
 		return false;
 
 	firsttime = !*bug_id;
-- 
cgit v1.2.3


From 75182a4eaaf8b697f66d68ad039f021f461dd2a4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Mar 2022 11:01:37 -0800
Subject: rcu: Add comments to final rcu_gp_cleanup() "if" statement

The final "if" statement in rcu_gp_cleanup() has proven to be rather
confusing, straightforward though it might have seemed when initially
written.  This commit therefore adds comments to its "then" and "else"
clauses to at least provide a more elevated form of confusion.

Reported-by: Boqun Feng <boqun.feng@gmail.com>
Reported-by: Frederic Weisbecker <frederic@kernel.org>
Reported-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reported-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a5ea67454640..29669070348e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2098,14 +2098,29 @@ static noinline void rcu_gp_cleanup(void)
 	/* Advance CBs to reduce false positives below. */
 	offloaded = rcu_rdp_is_offloaded(rdp);
 	if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+
+		// We get here if a grace period was needed (“needgp”)
+		// and the above call to rcu_accelerate_cbs() did not set
+		// the RCU_GP_FLAG_INIT bit in ->gp_state (which records
+		// the need for another grace period).  The purpose
+		// of the “offloaded” check is to avoid invoking
+		// rcu_accelerate_cbs() on an offloaded CPU because we do not
+		// hold the ->nocb_lock needed to safely access an offloaded
+		// ->cblist.  We do not want to acquire that lock because
+		// it can be heavily contended during callback floods.
+
 		WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
 		WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
-		trace_rcu_grace_period(rcu_state.name,
-				       rcu_state.gp_seq,
-				       TPS("newreq"));
+		trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
 	} else {
-		WRITE_ONCE(rcu_state.gp_flags,
-			   rcu_state.gp_flags & RCU_GP_FLAG_INIT);
+
+		// We get here either if there is no need for an
+		// additional grace period or if rcu_accelerate_cbs() has
+		// already set the RCU_GP_FLAG_INIT bit in ->gp_flags. 
+		// So all we need to do is to clear all of the other
+		// ->gp_flags bits.
+
+		WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
 	}
 	raw_spin_unlock_irq_rcu_node(rnp);
 
-- 
cgit v1.2.3


From 80d530b47da41642fab317a9485d58dfbe1e8896 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 11 Mar 2022 10:10:35 -0800
Subject: rcu: Print number of online CPUs in RCU CPU stall-warning messages

RCU's synchronous grace periods act quite differently when there is
only one online CPU, especially in the no-op case in kernels built with
CONFIG_PREEMPTION=n.  This change in behavior can be important debugging
information, so this commit adds the number of online CPUs to the RCU
CPU stall warning messages.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_stall.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 0c5d8516516a..268dd79c58e7 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -565,9 +565,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 
 	for_each_possible_cpu(cpu)
 		totqlen += rcu_get_n_cbs_cpu(cpu);
-	pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+	pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
 	       smp_processor_id(), (long)(jiffies - gps),
-	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
 	if (ndetected) {
 		rcu_dump_cpu_stacks();
 
@@ -626,9 +626,9 @@ static void print_cpu_stall(unsigned long gps)
 	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
 	for_each_possible_cpu(cpu)
 		totqlen += rcu_get_n_cbs_cpu(cpu);
-	pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+	pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
 		jiffies - gps,
-		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
 
 	rcu_check_gp_kthread_expired_fqs_timer();
 	rcu_check_gp_kthread_starvation();
-- 
cgit v1.2.3


From 70ae7b0ce03347fab35d6d8df81e1165d7ea8045 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 14 Mar 2022 14:37:38 +0100
Subject: rcu: Fix preemption mode check on synchronize_rcu[_expedited]()

An early check on synchronize_rcu[_expedited]() tries to determine if
the current CPU is in UP mode on an SMP no-preempt kernel, in which case
there is no need to start a grace period since the current assumed
quiescent state is all we need.

However the preemption mode doesn't take into account the boot selected
preemption mode under CONFIG_PREEMPT_DYNAMIC=y, missing a possible
early return if the running flavour is "none" or "voluntary".

Use the shiny new preempt mode accessors to fix this.  However,
avoid invoking them during early boot because doing so triggers a
WARN_ON_ONCE().

[ paulmck: Update for mainlined API. ]

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: Uladzislau Rezki <uladzislau.rezki@sony.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 29669070348e..d3caa82b9954 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3741,7 +3741,9 @@ static int rcu_blocking_is_gp(void)
 {
 	int ret;
 
-	if (IS_ENABLED(CONFIG_PREEMPTION))
+	// Invoking preempt_model_*() too early gets a splat.
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
+	    preempt_model_full() || preempt_model_rt())
 		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
 	might_sleep();  /* Check for RCU read-side critical section. */
 	preempt_disable();
-- 
cgit v1.2.3


From 88ca472f80604c070526eb58b977ea0a9c3c2e1f Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Thu, 24 Mar 2022 19:15:15 +0800
Subject: rcu: Check for successful spawn of ->boost_kthread_task

For the spawning of the priority-boost kthreads can fail, improbable
though this might seem.  This commit therefore refrains from attemoting
to initiate RCU priority boosting when The ->boost_kthread_task pointer
is NULL.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 176639c6215f..5c23aceecd62 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1125,7 +1125,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 	__releases(rnp->lock)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
-	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+	if (!rnp->boost_kthread_task ||
+	    (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
-- 
cgit v1.2.3


From f1efe84d6fd2af163989025bc285ff9b3b0c764f Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Tue, 29 Mar 2022 15:26:13 -0700
Subject: rcu_sync: Fix comment to properly reflect rcu_sync_exit() behavior

The rcu_sync_enter() function is used by updaters to force RCU readers
(e.g. percpu-rwsem) to use their slow paths during an update.  This is
accomplished by setting the ->gp_state of the rcu_sync structure to
GP_ENTER.  In the case of percpu-rwsem, the readers' slow path waits on
a semaphore instead of just incrementing a reader count.  Each updater
invokes the rcu_sync_exit() function to signal to readers that they
may again take their fastpaths.  The rcu_sync_exit() function sets the
->gp_state of the rcu_sync structure to GP_EXIT, and if all goes well,
after a grace period the ->gp_state reverts back to GP_IDLE.

Unfortunately, the rcu_sync_enter() function currently has a comment
incorrectly stating that rcu_sync_exit() (by an updater) will re-enable
reader "slowpaths".  This patch changes the comment to state that this
function re-enables reader fastpaths.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 33d896d85902..5cefc702158f 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp)
  * a slowpath during the update.  After this function returns, all
  * subsequent calls to rcu_sync_is_idle() will return false, which
  * tells readers to stay off their fastpaths.  A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
+ * rcu_sync_exit() re-enables reader fastpaths.
  *
  * When called in isolation, rcu_sync_enter() must wait for a grace
  * period, however, closely spaced calls to rcu_sync_enter() can
-- 
cgit v1.2.3


From f596e2ce1c0f250bb3ecc179f611be37e862635f Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Mon, 4 Apr 2022 07:59:32 +0800
Subject: rcu: Use IRQ_WORK_INIT_HARD() to avoid rcu_read_unlock() hangs

When booting kernels built with both CONFIG_RCU_STRICT_GRACE_PERIOD=y
and CONFIG_PREEMPT_RT=y, the rcu_read_unlock_special() function's
invocation of irq_work_queue_on() the init_irq_work() causes the
rcu_preempt_deferred_qs_handler() function to work execute in SCHED_FIFO
irq_work kthreads.  Because rcu_read_unlock_special() is invoked on each
rcu_read_unlock() in such kernels, the amount of work just keeps piling
up, resulting in a boot-time hang.

This commit therefore avoids this hang by using IRQ_WORK_INIT_HARD()
instead of init_irq_work(), but only in kernels built with both
CONFIG_PREEMPT_RT=y and CONFIG_RCU_STRICT_GRACE_PERIOD=y.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5c23aceecd62..2a3715419073 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -661,7 +661,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
 			    expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
 				// Get scheduler to re-evaluate and call hooks.
 				// If !IRQ_WORK, FQS scan will eventually IPI.
-				init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler);
+				if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+				    IS_ENABLED(CONFIG_PREEMPT_RT))
+					rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
+								rcu_preempt_deferred_qs_handler);
+				else
+					init_irq_work(&rdp->defer_qs_iw,
+						      rcu_preempt_deferred_qs_handler);
 				rdp->defer_qs_iw_pending = true;
 				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 			}
-- 
cgit v1.2.3


From 835f14ed53076384f0e1dad2fddb4881315f124f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 17 Mar 2022 11:05:09 -0700
Subject: rcu: Make the TASKS_RCU Kconfig option be selected

Currently, any kernel built with CONFIG_PREEMPTION=y also gets
CONFIG_TASKS_RCU=y, which is not helpful to people trying to build
preemptible kernels of minimal size.

Because CONFIG_TASKS_RCU=y is needed only in kernels doing tracing of
one form or another, this commit moves from TASKS_RCU deciding when it
should be enabled to the tracing Kconfig options explicitly selecting it.
This allows building preemptible kernels without TASKS_RCU, if desired.

This commit also updates the SRCU-N and TREE09 rcutorture scenarios
in order to avoid Kconfig errors that would otherwise result from
CONFIG_TASKS_RCU being selected without its CONFIG_RCU_EXPERT dependency
being met.

[ paulmck: Apply BPF_SYSCALL feedback from Andrii Nakryiko. ]

Reported-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/bpf/Kconfig   | 1 +
 kernel/rcu/Kconfig   | 3 ++-
 kernel/trace/Kconfig | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index d56ee177d5f8..2dfe1079f772 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -27,6 +27,7 @@ config BPF_SYSCALL
 	bool "Enable bpf() system call"
 	select BPF
 	select IRQ_WORK
+	select TASKS_RCU if PREEMPTION
 	select TASKS_TRACE_RCU
 	select BINARY_PRINTF
 	select NET_SOCK_MSG if NET
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index f559870fbf8b..4f665ae0cf55 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -78,7 +78,8 @@ config TASKS_RCU_GENERIC
 	  task-based RCU implementations.  Not for manual selection.
 
 config TASKS_RCU
-	def_bool PREEMPTION
+	def_bool 0
+	select IRQ_WORK
 	help
 	  This option enables a task-based RCU implementation that uses
 	  only voluntary context switch (not preemption!), idle, and
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2c43e327a619..bf5da6c4e999 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -144,6 +144,7 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 	select TRACE_CLOCK
+	select TASKS_RCU if PREEMPTION
 
 config GENERIC_TRACER
 	bool
-- 
cgit v1.2.3


From 40c1278aa7cd51d4f8627f7adc66aa73e01aff81 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 17 Mar 2022 13:29:59 -0700
Subject: rcutorture: Allow rcutorture without RCU Tasks Trace

Unless a kernel builds rcutorture, whether built-in or as a module, that
kernel is also built with CONFIG_TASKS_TRACE_RCU, whether anything else
needs Tasks Trace RCU or not.  This unnecessarily increases kernel size.
This commit therefore decouples the presence of rcutorture from the
presence of RCU Tasks Trace.

However, there is a need to select CONFIG_TASKS_TRACE_RCU for
testing purposes.  Except that casual users must not be bothered with
questions -- for them, this needs to be fully automated.  There is thus
a CONFIG_FORCE_TASKS_TRACE_RCU that selects CONFIG_TASKS_TRACE_RCU,
is user-selectable, but which depends on CONFIG_RCU_EXPERT.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig       |  22 +++++++----
 kernel/rcu/Kconfig.debug |   1 -
 kernel/rcu/rcutorture.c  | 101 ++++++++++++++++++++++++++---------------------
 3 files changed, 71 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 4f665ae0cf55..2befd328e6a0 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -95,15 +95,23 @@ config TASKS_RUDE_RCU
 	  switches on all online CPUs, including idle ones, so use
 	  with caution.
 
-config TASKS_TRACE_RCU
-	def_bool 0
-	select IRQ_WORK
+config FORCE_TASKS_TRACE_RCU
+	bool "Force selection of Tasks Trace RCU"
+	depends on RCU_EXPERT
+	select TASKS_TRACE_RCU
+	default n
 	help
 	  This option enables a task-based RCU implementation that uses
 	  explicit rcu_read_lock_trace() read-side markers, and allows
-	  these readers to appear in the idle loop as well as on the CPU
-	  hotplug code paths.  It can force IPIs on online CPUs, including
-	  idle ones, so use with caution.
+	  these readers to appear in the idle loop as well as on the
+	  CPU hotplug code paths.  It can force IPIs on online CPUs,
+	  including idle ones, so use with caution.  Not for manual
+	  selection in most cases.
+
+config TASKS_TRACE_RCU
+	bool
+	default n
+	select IRQ_WORK
 
 config RCU_STALL_COMMON
 	def_bool TREE_RCU
@@ -227,7 +235,7 @@ config RCU_NOCB_CPU
 
 config TASKS_TRACE_RCU_READ_MB
 	bool "Tasks Trace RCU readers use memory barriers in user and idle"
-	depends on RCU_EXPERT
+	depends on RCU_EXPERT && TASKS_TRACE_RCU
 	default PREEMPT_RT || NR_CPUS < 8
 	help
 	  Use this option to further reduce the number of IPIs sent
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 4fd64999300f..d7f4bb1c4979 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -49,7 +49,6 @@ config RCU_TORTURE_TEST
 	select SRCU
 	select TASKS_RCU
 	select TASKS_RUDE_RCU
-	select TASKS_TRACE_RCU
 	default n
 	help
 	  This option provides a kernel module that runs torture tests
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 55d049c39608..7dd3e14ec907 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -737,6 +737,48 @@ static struct rcu_torture_ops busted_srcud_ops = {
 	.name		= "busted_srcud"
 };
 
+/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+		WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+	}
+}
+
+static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
+{
+	preempt_disable();
+	return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
+{
+	preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+	.ttype		= RCU_TRIVIAL_FLAVOR,
+	.init		= rcu_sync_torture_init,
+	.readlock	= rcu_torture_read_lock_trivial,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_torture_read_unlock_trivial,
+	.readlock_held	= torture_readlock_not_held,
+	.get_gp_seq	= rcu_no_completed,
+	.sync		= synchronize_rcu_trivial,
+	.exp_sync	= synchronize_rcu_trivial,
+	.fqs		= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "trivial"
+};
+
 /*
  * Definitions for RCU-tasks torture testing.
  */
@@ -780,48 +822,6 @@ static struct rcu_torture_ops tasks_ops = {
 	.name		= "tasks"
 };
 
-/*
- * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
- */
-
-static void synchronize_rcu_trivial(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
-		WARN_ON_ONCE(raw_smp_processor_id() != cpu);
-	}
-}
-
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
-{
-	preempt_disable();
-	return 0;
-}
-
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
-{
-	preempt_enable();
-}
-
-static struct rcu_torture_ops trivial_ops = {
-	.ttype		= RCU_TRIVIAL_FLAVOR,
-	.init		= rcu_sync_torture_init,
-	.readlock	= rcu_torture_read_lock_trivial,
-	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock	= rcu_torture_read_unlock_trivial,
-	.readlock_held	= torture_readlock_not_held,
-	.get_gp_seq	= rcu_no_completed,
-	.sync		= synchronize_rcu_trivial,
-	.exp_sync	= synchronize_rcu_trivial,
-	.fqs		= NULL,
-	.stats		= NULL,
-	.irq_capable	= 1,
-	.name		= "trivial"
-};
-
 /*
  * Definitions for rude RCU-tasks torture testing.
  */
@@ -851,6 +851,8 @@ static struct rcu_torture_ops tasks_rude_ops = {
 	.name		= "tasks-rude"
 };
 
+#ifdef CONFIG_TASKS_TRACE_RCU
+
 /*
  * Definitions for tracing RCU-tasks torture testing.
  */
@@ -893,6 +895,15 @@ static struct rcu_torture_ops tasks_tracing_ops = {
 	.name		= "tasks-tracing"
 };
 
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU
+
+
 static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
 {
 	if (!cur_ops->gp_diff)
@@ -3096,9 +3107,9 @@ rcu_torture_init(void)
 	int flags = 0;
 	unsigned long gp_seq = 0;
 	static struct rcu_torture_ops *torture_ops[] = {
-		&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
-		&busted_srcud_ops, &tasks_ops, &tasks_rude_ops,
-		&tasks_tracing_ops, &trivial_ops,
+		&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
+		&tasks_ops, &tasks_rude_ops, TASKS_TRACING_OPS
+		&trivial_ops,
 	};
 
 	if (!torture_init_begin(torture_type, verbose))
-- 
cgit v1.2.3


From 3b6e1dd42317ec366dab3205f99280e2ab1ad85a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 17 Mar 2022 15:18:27 -0700
Subject: rcutorture: Allow rcutorture without RCU Tasks

Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for
RCU Tasks.  Unless that kernel builds rcutorture, whether built-in or as
a module, in which case RCU Tasks is (unnecessarily) used.  This both
increases kernel size and increases the complexity of certain tracing
operations.  This commit therefore decouples the presence of rcutorture
from the presence of RCU Tasks.

However, there is a need to select CONFIG_TASKS_RCU for testing purposes.
Except that casual users must not be bothered with questions -- for them,
this needs to be fully automated.  There is thus a CONFIG_FORCE_TASKS_RCU
that selects CONFIG_TASKS_RCU, is user-selectable, but which depends
on CONFIG_RCU_EXPERT.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig       | 18 +++++++++++++-----
 kernel/rcu/Kconfig.debug |  1 -
 kernel/rcu/rcutorture.c  | 13 ++++++++++++-
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 2befd328e6a0..8eac165db09f 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -77,13 +77,21 @@ config TASKS_RCU_GENERIC
 	  This option enables generic infrastructure code supporting
 	  task-based RCU implementations.  Not for manual selection.
 
+config FORCE_TASKS_RCU
+	bool "Force selection of TASKS_RCU"
+	depends on RCU_EXPERT
+	select TASKS_RCU
+	default n
+	help
+	  This option force-enables a task-based RCU implementation
+	  that uses only voluntary context switch (not preemption!),
+	  idle, and user-mode execution as quiescent states.  Not for
+	  manual selection in most cases.
+
 config TASKS_RCU
-	def_bool 0
+	bool
+	default n
 	select IRQ_WORK
-	help
-	  This option enables a task-based RCU implementation that uses
-	  only voluntary context switch (not preemption!), idle, and
-	  user-mode execution as quiescent states.  Not for manual selection.
 
 config TASKS_RUDE_RCU
 	def_bool 0
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index d7f4bb1c4979..c217a5e655a4 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -47,7 +47,6 @@ config RCU_TORTURE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
-	select TASKS_RCU
 	select TASKS_RUDE_RCU
 	default n
 	help
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7dd3e14ec907..65d045ff9766 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -779,6 +779,8 @@ static struct rcu_torture_ops trivial_ops = {
 	.name		= "trivial"
 };
 
+#ifdef CONFIG_TASKS_RCU
+
 /*
  * Definitions for RCU-tasks torture testing.
  */
@@ -822,6 +824,15 @@ static struct rcu_torture_ops tasks_ops = {
 	.name		= "tasks"
 };
 
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RCU
+
+
 /*
  * Definitions for rude RCU-tasks torture testing.
  */
@@ -3108,7 +3119,7 @@ rcu_torture_init(void)
 	unsigned long gp_seq = 0;
 	static struct rcu_torture_ops *torture_ops[] = {
 		&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
-		&tasks_ops, &tasks_rude_ops, TASKS_TRACING_OPS
+		TASKS_OPS &tasks_rude_ops, TASKS_TRACING_OPS
 		&trivial_ops,
 	};
 
-- 
cgit v1.2.3


From 4c3f7b0e1e880e892d4bc4f50bf627b251b6e2cc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 17 Mar 2022 16:16:45 -0700
Subject: rcutorture: Allow rcutorture without RCU Tasks Rude

Unless a kernel builds rcutorture, whether built-in or as a module, that
kernel is also built with CONFIG_TASKS_RUDE_RCU, whether anything else
needs Tasks Rude RCU or not.  This unnecessarily increases kernel size.
This commit therefore decouples the presence of rcutorture from the
presence of RCU Tasks Rude.

However, there is a need to select CONFIG_TASKS_RUDE_RCU for testing
purposes.  Except that casual users must not be bothered with
questions -- for them, this needs to be fully automated.  There is
thus a CONFIG_FORCE_TASKS_RUDE_RCU that selects CONFIG_TASKS_RUDE_RCU,
is user-selectable, but which depends on CONFIG_RCU_EXPERT.

[ paulmck: Apply kernel test robot feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig       | 21 ++++++++++++++-------
 kernel/rcu/Kconfig.debug |  1 -
 kernel/rcu/rcutorture.c  | 13 ++++++++++++-
 3 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 8eac165db09f..65d45c00fd1b 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -93,15 +93,22 @@ config TASKS_RCU
 	default n
 	select IRQ_WORK
 
+config FORCE_TASKS_RUDE_RCU
+	bool "Force selection of Tasks Rude RCU"
+	depends on RCU_EXPERT
+	select TASKS_RUDE_RCU
+	default n
+	help
+	  This option force-enables a task-based RCU implementation
+	  that uses only context switch (including preemption) and
+	  user-mode execution as quiescent states.  It forces IPIs and
+	  context switches on all online CPUs, including idle ones,
+	  so use with caution.	Not for manual selection in most cases.
+
 config TASKS_RUDE_RCU
-	def_bool 0
+	bool
+	default n
 	select IRQ_WORK
-	help
-	  This option enables a task-based RCU implementation that uses
-	  only context switch (including preemption) and user-mode
-	  execution as quiescent states.  It forces IPIs and context
-	  switches on all online CPUs, including idle ones, so use
-	  with caution.
 
 config FORCE_TASKS_TRACE_RCU
 	bool "Force selection of Tasks Trace RCU"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index c217a5e655a4..f4a4468cbf03 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -47,7 +47,6 @@ config RCU_TORTURE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
-	select TASKS_RUDE_RCU
 	default n
 	help
 	  This option provides a kernel module that runs torture tests
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 65d045ff9766..d528245108c2 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -833,6 +833,8 @@ static struct rcu_torture_ops tasks_ops = {
 #endif // #else #ifdef CONFIG_TASKS_RCU
 
 
+#ifdef CONFIG_TASKS_RUDE_RCU
+
 /*
  * Definitions for rude RCU-tasks torture testing.
  */
@@ -862,6 +864,15 @@ static struct rcu_torture_ops tasks_rude_ops = {
 	.name		= "tasks-rude"
 };
 
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU
+
+
 #ifdef CONFIG_TASKS_TRACE_RCU
 
 /*
@@ -3119,7 +3130,7 @@ rcu_torture_init(void)
 	unsigned long gp_seq = 0;
 	static struct rcu_torture_ops *torture_ops[] = {
 		&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
-		TASKS_OPS &tasks_rude_ops, TASKS_TRACING_OPS
+		TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
 		&trivial_ops,
 	};
 
-- 
cgit v1.2.3


From 5f654af150fd5aeb9fff138c7cbd72cea016b863 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 25 Mar 2022 14:39:54 -0700
Subject: refscale: Allow refscale without RCU Tasks

Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for
RCU Tasks.  Unless that kernel builds refscale, whether built-in or as a
module, in which case RCU Tasks is (unnecessarily) built in.  This both
increases kernel size and increases the complexity of certain tracing
operations.  This commit therefore decouples the presence of refscale
from the presence of RCU Tasks.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug |  1 -
 kernel/rcu/refscale.c    | 12 +++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index f4a4468cbf03..454924e03ef3 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -63,7 +63,6 @@ config RCU_REF_SCALE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
-	select TASKS_RCU
 	select TASKS_RUDE_RCU
 	select TASKS_TRACE_RCU
 	default n
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 5489ff7f478e..5079e47b3d18 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+#ifdef CONFIG_TASKS_RCU
+
 // Definitions for RCU Tasks ref scale testing: Empty read markers.
 // These definitions also work for RCU Rude readers.
 static void rcu_tasks_ref_scale_read_section(const int nloops)
@@ -232,6 +234,14 @@ static struct ref_scale_ops rcu_tasks_ops = {
 	.name		= "rcu-tasks"
 };
 
+#define RCU_TASKS_OPS &rcu_tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define RCU_TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
 // Definitions for RCU Tasks Trace ref scale testing.
 static void rcu_trace_ref_scale_read_section(const int nloops)
 {
@@ -790,7 +800,7 @@ ref_scale_init(void)
 	long i;
 	int firsterr = 0;
 	static struct ref_scale_ops *scale_ops[] = {
-		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
+		&rcu_ops, &srcu_ops, &rcu_trace_ops, RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
 		&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
 	};
 
-- 
cgit v1.2.3


From dec86781a54f4a527386a0b86b22e99e2ac67a09 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 25 Mar 2022 15:21:07 -0700
Subject: refscale: Allow refscale without RCU Tasks Rude/Trace

Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for
RCU Tasks Rude and RCU Tasks Trace.  Unless that kernel builds refscale,
whether built-in or as a module, in which case these RCU Tasks flavors are
(unnecessarily) built in.  This both increases kernel size and increases
the complexity of certain tracing operations.  This commit therefore
decouples the presence of refscale from the presence of RCU Tasks Rude
and RCU Tasks Trace.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug |  2 --
 kernel/rcu/refscale.c    | 12 +++++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 454924e03ef3..dceaa3e754e5 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -63,8 +63,6 @@ config RCU_REF_SCALE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
-	select TASKS_RUDE_RCU
-	select TASKS_TRACE_RCU
 	default n
 	help
 	  This option provides a kernel module that runs performance tests
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 5079e47b3d18..909644abee67 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -242,6 +242,8 @@ static struct ref_scale_ops rcu_tasks_ops = {
 
 #endif // #else // #ifdef CONFIG_TASKS_RCU
 
+#ifdef CONFIG_TASKS_TRACE_RCU
+
 // Definitions for RCU Tasks Trace ref scale testing.
 static void rcu_trace_ref_scale_read_section(const int nloops)
 {
@@ -271,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = {
 	.name		= "rcu-trace"
 };
 
+#define RCU_TRACE_OPS &rcu_trace_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define RCU_TRACE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
 // Definitions for reference count
 static atomic_t refcnt;
 
@@ -800,7 +810,7 @@ ref_scale_init(void)
 	long i;
 	int firsterr = 0;
 	static struct ref_scale_ops *scale_ops[] = {
-		&rcu_ops, &srcu_ops, &rcu_trace_ops, RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
+		&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
 		&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
 	};
 
-- 
cgit v1.2.3


From 4df002d908796c1ff87b985af1d31a0e36e6c66f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 25 Mar 2022 16:39:01 -0700
Subject: rcuscale: Allow rcuscale without RCU Tasks

Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for
RCU Tasks.  Unless that kernel builds rcuscale, whether built-in or as
a module, in which case RCU Tasks is (unnecessarily) built.  This both
increases kernel size and increases the complexity of certain tracing
operations.  This commit therefore decouples the presence of rcuscale
from the presence of RCU Tasks.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug |  1 -
 kernel/rcu/rcuscale.c    | 12 +++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index dceaa3e754e5..71e73fceff87 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -28,7 +28,6 @@ config RCU_SCALE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
-	select TASKS_RCU
 	select TASKS_RUDE_RCU
 	select TASKS_TRACE_RCU
 	default n
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 5e4f1f83d38e..311dbcb064ed 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = {
 	.name		= "srcud"
 };
 
+#ifdef CONFIG_TASKS_RCU
+
 /*
  * Definitions for RCU-tasks scalability testing.
  */
@@ -295,6 +297,14 @@ static struct rcu_scale_ops tasks_ops = {
 	.name		= "tasks"
 };
 
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
 /*
  * Definitions for RCU-tasks-trace scalability testing.
  */
@@ -797,7 +807,7 @@ rcu_scale_init(void)
 	long i;
 	int firsterr = 0;
 	static struct rcu_scale_ops *scale_ops[] = {
-		&rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops
+		&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS &tasks_tracing_ops
 	};
 
 	if (!torture_init_begin(scale_type, verbose))
-- 
cgit v1.2.3


From 5ce027f4cd0e2f28ea5574ede9eef290e2ede5c5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 25 Mar 2022 17:05:40 -0700
Subject: rcuscale: Allow rcuscale without RCU Tasks Rude/Trace

Currently, a CONFIG_PREEMPT_NONE=y kernel substitutes normal RCU for
RCU Tasks Rude and RCU Tasks Trace.  Unless that kernel builds rcuscale,
whether built-in or as a module, in which case these RCU Tasks flavors are
(unnecessarily) built in.  This both increases kernel size and increases
the complexity of certain tracing operations.  This commit therefore
decouples the presence of rcuscale from the presence of RCU Tasks Rude
and RCU Tasks Trace.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug |  2 --
 kernel/rcu/rcuscale.c    | 12 +++++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 71e73fceff87..68092e1db64b 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -28,8 +28,6 @@ config RCU_SCALE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
-	select TASKS_RUDE_RCU
-	select TASKS_TRACE_RCU
 	default n
 	help
 	  This option provides a kernel module that runs performance
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 311dbcb064ed..277a5bfb37d4 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -305,6 +305,8 @@ static struct rcu_scale_ops tasks_ops = {
 
 #endif // #else // #ifdef CONFIG_TASKS_RCU
 
+#ifdef CONFIG_TASKS_TRACE_RCU
+
 /*
  * Definitions for RCU-tasks-trace scalability testing.
  */
@@ -334,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = {
 	.name		= "tasks-tracing"
 };
 
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
 static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
 {
 	if (!cur_ops->gp_diff)
@@ -807,7 +817,7 @@ rcu_scale_init(void)
 	long i;
 	int firsterr = 0;
 	static struct rcu_scale_ops *scale_ops[] = {
-		&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS &tasks_tracing_ops
+		&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS
 	};
 
 	if (!torture_init_begin(scale_type, verbose))
-- 
cgit v1.2.3


From 42ba1308074d9046386d58b56e793604be48ce22 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 15 Apr 2022 21:33:42 +0530
Subject: bpf: Make btf_find_field more generic

Next commit introduces field type 'kptr' whose kind will not be struct,
but pointer, and it will not be limited to one offset, but multiple
ones. Make existing btf_find_struct_field and btf_find_datasec_var
functions amenable to use for finding kptrs in map value, by moving
spin_lock and timer specific checks into their own function.

The alignment, and name are checked before the function is called, so it
is the last point where we can skip field or return an error before the
next loop iteration happens. Size of the field and type is meant to be
checked inside the function.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220415160354.1050687-2-memxor@gmail.com
---
 kernel/bpf/btf.c | 120 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7906b9bf7ff8..0493310d981f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3163,24 +3163,44 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+enum btf_field_type {
+	BTF_FIELD_SPIN_LOCK,
+	BTF_FIELD_TIMER,
+};
+
+struct btf_field_info {
+	u32 off;
+};
+
+static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
+			   u32 off, int sz, struct btf_field_info *info)
+{
+	if (!__btf_type_is_struct(t))
+		return 0;
+	if (t->size != sz)
+		return 0;
+	if (info->off != -ENOENT)
+		/* only one such field is allowed */
+		return -E2BIG;
+	info->off = off;
+	return 0;
+}
+
 static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
-				 const char *name, int sz, int align)
+				 const char *name, int sz, int align,
+				 enum btf_field_type field_type,
+				 struct btf_field_info *info)
 {
 	const struct btf_member *member;
-	u32 i, off = -ENOENT;
+	u32 i, off;
 
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								    member->type);
-		if (!__btf_type_is_struct(member_type))
-			continue;
-		if (member_type->size != sz)
-			continue;
+
 		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
 			continue;
-		if (off != -ENOENT)
-			/* only one such field is allowed */
-			return -E2BIG;
+
 		off = __btf_member_bit_offset(t, member);
 		if (off % 8)
 			/* valid C code cannot generate such BTF */
@@ -3188,46 +3208,76 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
 		off /= 8;
 		if (off % align)
 			return -EINVAL;
+
+		switch (field_type) {
+		case BTF_FIELD_SPIN_LOCK:
+		case BTF_FIELD_TIMER:
+			return btf_find_struct(btf, member_type, off, sz, info);
+		default:
+			return -EFAULT;
+		}
 	}
-	return off;
+	return 0;
 }
 
 static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
-				const char *name, int sz, int align)
+				const char *name, int sz, int align,
+				enum btf_field_type field_type,
+				struct btf_field_info *info)
 {
 	const struct btf_var_secinfo *vsi;
-	u32 i, off = -ENOENT;
+	u32 i, off;
 
 	for_each_vsi(i, t, vsi) {
 		const struct btf_type *var = btf_type_by_id(btf, vsi->type);
 		const struct btf_type *var_type = btf_type_by_id(btf, var->type);
 
-		if (!__btf_type_is_struct(var_type))
-			continue;
-		if (var_type->size != sz)
+		off = vsi->offset;
+
+		if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
 			continue;
 		if (vsi->size != sz)
 			continue;
-		if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
-			continue;
-		if (off != -ENOENT)
-			/* only one such field is allowed */
-			return -E2BIG;
-		off = vsi->offset;
 		if (off % align)
 			return -EINVAL;
+
+		switch (field_type) {
+		case BTF_FIELD_SPIN_LOCK:
+		case BTF_FIELD_TIMER:
+			return btf_find_struct(btf, var_type, off, sz, info);
+		default:
+			return -EFAULT;
+		}
 	}
-	return off;
+	return 0;
 }
 
 static int btf_find_field(const struct btf *btf, const struct btf_type *t,
-			  const char *name, int sz, int align)
+			  enum btf_field_type field_type,
+			  struct btf_field_info *info)
 {
+	const char *name;
+	int sz, align;
+
+	switch (field_type) {
+	case BTF_FIELD_SPIN_LOCK:
+		name = "bpf_spin_lock";
+		sz = sizeof(struct bpf_spin_lock);
+		align = __alignof__(struct bpf_spin_lock);
+		break;
+	case BTF_FIELD_TIMER:
+		name = "bpf_timer";
+		sz = sizeof(struct bpf_timer);
+		align = __alignof__(struct bpf_timer);
+		break;
+	default:
+		return -EFAULT;
+	}
 
 	if (__btf_type_is_struct(t))
-		return btf_find_struct_field(btf, t, name, sz, align);
+		return btf_find_struct_field(btf, t, name, sz, align, field_type, info);
 	else if (btf_type_is_datasec(t))
-		return btf_find_datasec_var(btf, t, name, sz, align);
+		return btf_find_datasec_var(btf, t, name, sz, align, field_type, info);
 	return -EINVAL;
 }
 
@@ -3237,16 +3287,24 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
  */
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
 {
-	return btf_find_field(btf, t, "bpf_spin_lock",
-			      sizeof(struct bpf_spin_lock),
-			      __alignof__(struct bpf_spin_lock));
+	struct btf_field_info info = { .off = -ENOENT };
+	int ret;
+
+	ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info);
+	if (ret < 0)
+		return ret;
+	return info.off;
 }
 
 int btf_find_timer(const struct btf *btf, const struct btf_type *t)
 {
-	return btf_find_field(btf, t, "bpf_timer",
-			      sizeof(struct bpf_timer),
-			      __alignof__(struct bpf_timer));
+	struct btf_field_info info = { .off = -ENOENT };
+	int ret;
+
+	ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info);
+	if (ret < 0)
+		return ret;
+	return info.off;
 }
 
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
-- 
cgit v1.2.3


From e9147b4422e1f35b9c229c980c596ccf03d61562 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 15 Apr 2022 21:33:43 +0530
Subject: bpf: Move check_ptr_off_reg before check_map_access

Some functions in next patch want to use this function, and those
functions will be called by check_map_access, hence move it before
check_map_access.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/bpf/20220415160354.1050687-3-memxor@gmail.com
---
 kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9c1a02b82ecd..71827d14724a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3469,6 +3469,44 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+			       const struct bpf_reg_state *reg, int regno,
+			       bool fixed_off_ok)
+{
+	/* Access to this pointer-typed register or passing it to a helper
+	 * is only allowed in its original, unmodified form.
+	 */
+
+	if (reg->off < 0) {
+		verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+			reg_type_str(env, reg->type), regno, reg->off);
+		return -EACCES;
+	}
+
+	if (!fixed_off_ok && reg->off) {
+		verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+			reg_type_str(env, reg->type), regno, reg->off);
+		return -EACCES;
+	}
+
+	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+		char tn_buf[48];
+
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "variable %s access var_off=%s disallowed\n",
+			reg_type_str(env, reg->type), tn_buf);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
+int check_ptr_off_reg(struct bpf_verifier_env *env,
+		      const struct bpf_reg_state *reg, int regno)
+{
+	return __check_ptr_off_reg(env, reg, regno, false);
+}
+
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			    int off, int size, bool zero_size_allowed)
@@ -3980,44 +4018,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 }
 #endif
 
-static int __check_ptr_off_reg(struct bpf_verifier_env *env,
-			       const struct bpf_reg_state *reg, int regno,
-			       bool fixed_off_ok)
-{
-	/* Access to this pointer-typed register or passing it to a helper
-	 * is only allowed in its original, unmodified form.
-	 */
-
-	if (reg->off < 0) {
-		verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
-			reg_type_str(env, reg->type), regno, reg->off);
-		return -EACCES;
-	}
-
-	if (!fixed_off_ok && reg->off) {
-		verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
-			reg_type_str(env, reg->type), regno, reg->off);
-		return -EACCES;
-	}
-
-	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
-		char tn_buf[48];
-
-		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose(env, "variable %s access var_off=%s disallowed\n",
-			reg_type_str(env, reg->type), tn_buf);
-		return -EACCES;
-	}
-
-	return 0;
-}
-
-int check_ptr_off_reg(struct bpf_verifier_env *env,
-		      const struct bpf_reg_state *reg, int regno)
-{
-	return __check_ptr_off_reg(env, reg, regno, false);
-}
-
 static int __check_buffer_access(struct bpf_verifier_env *env,
 				 const char *buf_info,
 				 const struct bpf_reg_state *reg,
-- 
cgit v1.2.3


From f8b7d2b4c192118c37ab24c0540d1134dd0104d8 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 15 Apr 2022 14:29:57 -0700
Subject: ftrace: fix building with SYSCTL=n but DYNAMIC_FTRACE=y
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One can enable dyanmic tracing but disable sysctls.
When this is doen we get the compile kernel warning:

  CC      kernel/trace/ftrace.o
kernel/trace/ftrace.c:3086:13: warning: ‘ftrace_shutdown_sysctl’ defined
but not used [-Wunused-function]
 3086 | static void ftrace_shutdown_sysctl(void)
      |             ^~~~~~~~~~~~~~~~~~~~~~
kernel/trace/ftrace.c:3068:13: warning: ‘ftrace_startup_sysctl’ defined
but not used [-Wunused-function]
 3068 | static void ftrace_startup_sysctl(void)

When CONFIG_DYNAMIC_FTRACE=n the ftrace_startup_sysctl() and
routines ftrace_shutdown_sysctl() still compiles, so these
are actually more just used for when SYSCTL=y.

Fix this then by just moving these routines to when sysctls
are enabled.

Fixes: 7cde53da38a3 ("ftrace: move sysctl_ftrace_enabled to ftrace.c")
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/trace/ftrace.c | 71 ++++++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index db8d553728b6..d9424fd9a183 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3065,40 +3065,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	return 0;
 }
 
-static void ftrace_startup_sysctl(void)
-{
-	int command;
-
-	if (unlikely(ftrace_disabled))
-		return;
-
-	/* Force update next time */
-	saved_ftrace_func = NULL;
-	/* ftrace_start_up is true if we want ftrace running */
-	if (ftrace_start_up) {
-		command = FTRACE_UPDATE_CALLS;
-		if (ftrace_graph_active)
-			command |= FTRACE_START_FUNC_RET;
-		ftrace_startup_enable(command);
-	}
-}
-
-static void ftrace_shutdown_sysctl(void)
-{
-	int command;
-
-	if (unlikely(ftrace_disabled))
-		return;
-
-	/* ftrace_start_up is true if ftrace is running */
-	if (ftrace_start_up) {
-		command = FTRACE_DISABLE_CALLS;
-		if (ftrace_graph_active)
-			command |= FTRACE_STOP_FUNC_RET;
-		ftrace_run_update_code(command);
-	}
-}
-
 static u64		ftrace_update_time;
 unsigned long		ftrace_update_tot_cnt;
 unsigned long		ftrace_number_of_pages;
@@ -7267,9 +7233,6 @@ core_initcall(ftrace_nodyn_init);
 static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_all(int command) { }
 
-# define ftrace_startup_sysctl()	do { } while (0)
-# define ftrace_shutdown_sysctl()	do { } while (0)
-
 static void ftrace_update_trampoline(struct ftrace_ops *ops)
 {
 }
@@ -7910,6 +7873,40 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 EXPORT_SYMBOL_GPL(unregister_ftrace_function);
 
 #ifdef CONFIG_SYSCTL
+static void ftrace_startup_sysctl(void)
+{
+	int command;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	/* Force update next time */
+	saved_ftrace_func = NULL;
+	/* ftrace_start_up is true if we want ftrace running */
+	if (ftrace_start_up) {
+		command = FTRACE_UPDATE_CALLS;
+		if (ftrace_graph_active)
+			command |= FTRACE_START_FUNC_RET;
+		ftrace_startup_enable(command);
+	}
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+	int command;
+
+	if (unlikely(ftrace_disabled))
+		return;
+
+	/* ftrace_start_up is true if ftrace is running */
+	if (ftrace_start_up) {
+		command = FTRACE_DISABLE_CALLS;
+		if (ftrace_graph_active)
+			command |= FTRACE_STOP_FUNC_RET;
+		ftrace_run_update_code(command);
+	}
+}
+
 static bool is_permanent_ops_registered(void)
 {
 	struct ftrace_ops *op;
-- 
cgit v1.2.3


From 988f11e046401f8561c4afefa506a50f0203de40 Mon Sep 17 00:00:00 2001
From: liaohua <liaohua4@huawei.com>
Date: Thu, 7 Apr 2022 15:29:48 +0800
Subject: latencytop: move sysctl to its own file

This moves latencytop sysctl to kernel/latencytop.c

Signed-off-by: liaohua <liaohua4@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/latencytop.c | 41 +++++++++++++++++++++++++++++------------
 kernel/sysctl.c     | 10 ----------
 2 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 166d7bf49666..76166df011a4 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,7 @@
 #include <linux/sched/stat.h>
 #include <linux/list.h>
 #include <linux/stacktrace.h>
+#include <linux/sysctl.h>
 
 static DEFINE_RAW_SPINLOCK(latency_lock);
 
@@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR];
 
 int latencytop_enabled;
 
+#ifdef CONFIG_SYSCTL
+static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int err;
+
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (latencytop_enabled)
+		force_schedstat_enabled();
+
+	return err;
+}
+
+static struct ctl_table latencytop_sysctl[] = {
+	{
+		.procname   = "latencytop",
+		.data       = &latencytop_enabled,
+		.maxlen     = sizeof(int),
+		.mode       = 0644,
+		.proc_handler   = sysctl_latencytop,
+	},
+	{}
+};
+#endif
+
 void clear_tsk_latency_tracing(struct task_struct *p)
 {
 	unsigned long flags;
@@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = {
 static int __init init_lstats_procfs(void)
 {
 	proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("kernel", latencytop_sysctl);
+#endif
 	return 0;
 }
-
-int sysctl_latencytop(struct ctl_table *table, int write, void *buffer,
-		size_t *lenp, loff_t *ppos)
-{
-	int err;
-
-	err = proc_dointvec(table, write, buffer, lenp, ppos);
-	if (latencytop_enabled)
-		force_schedstat_enabled();
-
-	return err;
-}
 device_initcall(init_lstats_procfs);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0fdf465a93d..b60345cbadf0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,7 +64,6 @@
 #include <linux/kexec.h>
 #include <linux/mount.h>
 #include <linux/userfaultfd_k.h>
-#include <linux/latencytop.h>
 #include <linux/pid.h>
 
 #include "../lib/kstrtox.h"
@@ -1623,15 +1622,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_NEG_ONE,
 		.extra2		= SYSCTL_ONE,
 	},
-#endif
-#ifdef CONFIG_LATENCYTOP
-	{
-		.procname	= "latencytop",
-		.data		= &latencytop_enabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_latencytop,
-	},
 #endif
 	{
 		.procname	= "print-fatal-signals",
-- 
cgit v1.2.3


From 8fd7c2144d1292f15c901211750dee021ed5079a Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Thu, 21 Apr 2022 11:38:43 -0700
Subject: ftrace: fix building with SYSCTL=y but DYNAMIC_FTRACE=n

Ok so hopefully this is the last of it. 0day picked up a build
failure [0] when SYSCTL=y but DYNAMIC_FTRACE=n. This can be fixed
by just declaring an empty routine for the calls moved just
recently.

[0] https://lkml.kernel.org/r/202204161203.6dSlgKJX-lkp@intel.com

Reported-by: kernel test robot <lkp@intel.com>
Fixes: f8b7d2b4c192 ("ftrace: fix building with SYSCTL=n but DYNAMIC_FTRACE=y")
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/trace/ftrace.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d9424fd9a183..1f89039f0feb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7873,6 +7873,8 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 EXPORT_SYMBOL_GPL(unregister_ftrace_function);
 
 #ifdef CONFIG_SYSCTL
+
+#ifdef CONFIG_DYNAMIC_FTRACE
 static void ftrace_startup_sysctl(void)
 {
 	int command;
@@ -7906,6 +7908,10 @@ static void ftrace_shutdown_sysctl(void)
 		ftrace_run_update_code(command);
 	}
 }
+#else
+# define ftrace_startup_sysctl()       do { } while (0)
+# define ftrace_shutdown_sysctl()      do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
 
 static bool is_permanent_ops_registered(void)
 {
-- 
cgit v1.2.3


From 10a5a651e3afc9b0b381f47e8930972e4e918397 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Thu, 31 Mar 2022 13:57:17 +0800
Subject: workqueue: Restrict kworker in the offline CPU pool running on
 housekeeping CPUs

When a CPU is going offline, all workers on the CPU's pool will have their
cpus_allowed cleared to cpu_possible_mask and can run on any CPUs including
the isolated ones. Instead, set cpus_allowed to wq_unbound_cpumask so that
the can avoid isolated CPUs.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0d2514b4ff0d..4056f2a3f9d5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5001,7 +5001,7 @@ static void unbind_workers(int cpu)
 
 		for_each_pool_worker(worker, pool) {
 			kthread_set_per_cpu(worker->task, -1);
-			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
 		}
 
 		mutex_unlock(&wq_pool_attach_mutex);
-- 
cgit v1.2.3


From ecc04463d1a36f88baa750d45dfb02c364e1fdb1 Mon Sep 17 00:00:00 2001
From: Aleksandr Nogikh <nogikh@google.com>
Date: Thu, 21 Apr 2022 16:36:07 -0700
Subject: kcov: don't generate a warning on vm_insert_page()'s failure

vm_insert_page()'s failure is not an unexpected condition, so don't do
WARN_ONCE() in such a case.

Instead, print a kernel message and just return an error code.

This flaw has been reported under an OOM condition by sysbot [1].

The message is mainly for the benefit of the test log, in this case the
fuzzer's log so that humans inspecting the log can figure out what was
going on.  KCOV is a testing tool, so I think being a little more chatty
when KCOV unexpectedly is about to fail will save someone debugging
time.

We don't want the WARN, because it's not a kernel bug that syzbot should
report, and failure can happen if the fuzzer tries hard enough (as
above).

Link: https://lkml.kernel.org/r/Ylkr2xrVbhQYwNLf@elver.google.com [1]
Link: https://lkml.kernel.org/r/20220401182512.249282-1-nogikh@google.com
Fixes: b3d7fe86fbd0 ("kcov: properly handle subsequent mmap calls"),
Signed-off-by: Aleksandr Nogikh <nogikh@google.com>
Acked-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Taras Madan <tarasmadan@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 475524bd900a..b3732b210593 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -475,8 +475,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_DONTEXPAND;
 	for (off = 0; off < size; off += PAGE_SIZE) {
 		page = vmalloc_to_page(kcov->area + off);
-		if (vm_insert_page(vma, vma->vm_start + off, page))
-			WARN_ONCE(1, "vm_insert_page() failed");
+		res = vm_insert_page(vma, vma->vm_start + off, page);
+		if (res) {
+			pr_warn_once("kcov: vm_insert_page() failed\n");
+			return res;
+		}
 	}
 	return 0;
 exit:
-- 
cgit v1.2.3


From 78ed93d72ded679e3caf0758357209887bda885f Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 4 Apr 2022 13:12:04 +0200
Subject: signal: Deliver SIGTRAP on perf event asynchronously if blocked

With SIGTRAP on perf events, we have encountered termination of
processes due to user space attempting to block delivery of SIGTRAP.
Consider this case:

    <set up SIGTRAP on a perf event>
    ...
    sigset_t s;
    sigemptyset(&s);
    sigaddset(&s, SIGTRAP | <and others>);
    sigprocmask(SIG_BLOCK, &s, ...);
    ...
    <perf event triggers>

When the perf event triggers, while SIGTRAP is blocked, force_sig_perf()
will force the signal, but revert back to the default handler, thus
terminating the task.

This makes sense for error conditions, but not so much for explicitly
requested monitoring. However, the expectation is still that signals
generated by perf events are synchronous, which will no longer be the
case if the signal is blocked and delivered later.

To give user space the ability to clearly distinguish synchronous from
asynchronous signals, introduce siginfo_t::si_perf_flags and
TRAP_PERF_FLAG_ASYNC (opted for flags in case more binary information is
required in future).

The resolution to the problem is then to (a) no longer force the signal
(avoiding the terminations), but (b) tell user space via si_perf_flags
if the signal was synchronous or not, so that such signals can be
handled differently (e.g. let user space decide to ignore or consider
the data imprecise).

The alternative of making the kernel ignore SIGTRAP on perf events if
the signal is blocked may work for some usecases, but likely causes
issues in others that then have to revert back to interception of
sigprocmask() (which we want to avoid). [ A concrete example: when using
breakpoint perf events to track data-flow, in a region of code where
signals are blocked, data-flow can no longer be tracked accurately.
When a relevant asynchronous signal is received after unblocking the
signal, the data-flow tracking logic needs to know its state is
imprecise. ]

Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Link: https://lore.kernel.org/r/20220404111204.935357-1-elver@google.com
---
 kernel/events/core.c |  4 ++--
 kernel/signal.c      | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index cfde994ce61c..6eafb1b0ad4a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6533,8 +6533,8 @@ static void perf_sigtrap(struct perf_event *event)
 	if (current->flags & PF_EXITING)
 		return;
 
-	force_sig_perf((void __user *)event->pending_addr,
-		       event->attr.type, event->attr.sig_data);
+	send_sig_perf((void __user *)event->pending_addr,
+		      event->attr.type, event->attr.sig_data);
 }
 
 static void perf_pending_event_disable(struct perf_event *event)
diff --git a/kernel/signal.c b/kernel/signal.c
index 30cd1ca43bcd..e43bc2a692f5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1805,7 +1805,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
 }
 #endif
 
-int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
+int send_sig_perf(void __user *addr, u32 type, u64 sig_data)
 {
 	struct kernel_siginfo info;
 
@@ -1817,7 +1817,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
 	info.si_perf_data = sig_data;
 	info.si_perf_type = type;
 
-	return force_sig_info(&info);
+	/*
+	 * Signals generated by perf events should not terminate the whole
+	 * process if SIGTRAP is blocked, however, delivering the signal
+	 * asynchronously is better than not delivering at all. But tell user
+	 * space if the signal was asynchronous, so it can clearly be
+	 * distinguished from normal synchronous ones.
+	 */
+	info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ?
+				     TRAP_PERF_FLAG_ASYNC :
+				     0;
+
+	return send_sig_info(info.si_signo, &info, current);
 }
 
 /**
@@ -3432,6 +3443,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
 		to->si_addr = ptr_to_compat(from->si_addr);
 		to->si_perf_data = from->si_perf_data;
 		to->si_perf_type = from->si_perf_type;
+		to->si_perf_flags = from->si_perf_flags;
 		break;
 	case SIL_CHLD:
 		to->si_pid = from->si_pid;
@@ -3509,6 +3521,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
 		to->si_addr = compat_ptr(from->si_addr);
 		to->si_perf_data = from->si_perf_data;
 		to->si_perf_type = from->si_perf_type;
+		to->si_perf_flags = from->si_perf_flags;
 		break;
 	case SIL_CHLD:
 		to->si_pid    = from->si_pid;
@@ -4722,6 +4735,7 @@ static inline void siginfo_buildtime_checks(void)
 	CHECK_OFFSET(si_pkey);
 	CHECK_OFFSET(si_perf_data);
 	CHECK_OFFSET(si_perf_type);
+	CHECK_OFFSET(si_perf_flags);
 
 	/* sigpoll */
 	CHECK_OFFSET(si_band);
-- 
cgit v1.2.3


From 915a087e4c47334a2f7ba2a4092c4bade0873769 Mon Sep 17 00:00:00 2001
From: Hailong Liu <liuhailong@linux.alibaba.com>
Date: Fri, 1 Apr 2022 13:10:11 +0800
Subject: psi: Fix trigger being fired unexpectedly at initial

When a trigger being created, its win.start_value and win.start_time are
reset to zero. If group->total[PSI_POLL][t->state] has accumulated before,
this trigger will be fired unexpectedly in the next period, even if its
growth time does not reach its threshold.

So set the window of the new trigger to the current state value.

Signed-off-by: Hailong Liu <liuhailong@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Link: https://lore.kernel.org/r/1648789811-3788971-1-git-send-email-liuhailong@linux.alibaba.com
---
 kernel/sched/psi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a4fa3aadfcba..5a49a8c8783e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1117,7 +1117,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	t->state = state;
 	t->threshold = threshold_us * NSEC_PER_USEC;
 	t->win.size = window_us * NSEC_PER_USEC;
-	window_reset(&t->win, 0, 0, 0);
+	window_reset(&t->win, sched_clock(),
+			group->total[PSI_POLL][t->state], 0);
 
 	t->event = 0;
 	t->last_event_time = 0;
-- 
cgit v1.2.3


From 06354900787f25bf5be3c07a68e3cdbc5bf0fa69 Mon Sep 17 00:00:00 2001
From: zgpeng <zgpeng.linux@gmail.com>
Date: Wed, 6 Apr 2022 17:57:05 +0800
Subject: sched/fair: Move calculate of avg_load to a better location

In calculate_imbalance function, when the value of local->avg_load is
greater than or equal to busiest->avg_load, the calculated sds->avg_load is
not used. So this calculation can be placed in a more appropriate position.

Signed-off-by: zgpeng <zgpeng@tencent.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Samuel Liao <samuelliao@tencent.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/1649239025-10010-1-git-send-email-zgpeng@tencent.com
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bd299d67ab..601f8bdc39b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9460,8 +9460,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
 				  local->group_capacity;
 
-		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
-				sds->total_capacity;
 		/*
 		 * If the local group is more loaded than the selected
 		 * busiest group don't try to pull any tasks.
@@ -9470,6 +9468,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 			env->imbalance = 0;
 			return;
 		}
+
+		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+				sds->total_capacity;
 	}
 
 	/*
-- 
cgit v1.2.3


From 64eaf50731ac0a8c76ce2fedd50ef6652aabc5ff Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Fri, 8 Apr 2022 19:53:08 +0800
Subject: sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq

Since commit 23127296889f ("sched/fair: Update scale invariance of PELT")
change to use rq_clock_pelt() instead of rq_clock_task(), we should also
use rq_clock_pelt() for throttled_clock_task_time and throttled_clock_task
accounting to get correct cfs_rq_clock_pelt() of throttled cfs_rq. And
rename throttled_clock_task(_time) to be clock_pelt rather than clock_task.

Fixes: 23127296889f ("sched/fair: Update scale invariance of PELT")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20220408115309.81603-1-zhouchengming@bytedance.com
---
 kernel/sched/fair.c  | 8 ++++----
 kernel/sched/pelt.h  | 4 ++--
 kernel/sched/sched.h | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 601f8bdc39b9..f74b34080c9a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4846,8 +4846,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 
 	cfs_rq->throttle_count--;
 	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
-					     cfs_rq->throttled_clock_task;
+		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+					     cfs_rq->throttled_clock_pelt;
 
 		/* Add cfs_rq with load or one or more already running entities to the list */
 		if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
@@ -4864,7 +4864,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 
 	/* group is entering throttled state, stop time */
 	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
 		list_del_leaf_cfs_rq(cfs_rq);
 	}
 	cfs_rq->throttle_count++;
@@ -5308,7 +5308,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index c336f5f481bc..4ff2ed4f8fa1 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq)
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 {
 	if (unlikely(cfs_rq->throttle_count))
-		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+		return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
 
-	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
 }
 #else
 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58263f90c559..762be73972bd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -603,8 +603,8 @@ struct cfs_rq {
 	s64			runtime_remaining;
 
 	u64			throttled_clock;
-	u64			throttled_clock_task;
-	u64			throttled_clock_task_time;
+	u64			throttled_clock_pelt;
+	u64			throttled_clock_pelt_time;
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
-- 
cgit v1.2.3


From 0a00a354644ee1800d31c47cf5927b9b50272fac Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Fri, 8 Apr 2022 19:53:09 +0800
Subject: sched/fair: Delete useless condition in tg_unthrottle_up()

We have tested cfs_rq->load.weight in cfs_rq_is_decayed(),
the first condition "!cfs_rq_is_decayed(cfs_rq)" is enough
to cover the second condition "cfs_rq->nr_running".

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20220408115309.81603-2-zhouchengming@bytedance.com
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f74b34080c9a..3eba0dcc4962 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4850,7 +4850,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 					     cfs_rq->throttled_clock_pelt;
 
 		/* Add cfs_rq with load or one or more already running entities to the list */
-		if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+		if (!cfs_rq_is_decayed(cfs_rq))
 			list_add_leaf_cfs_rq(cfs_rq);
 	}
 
-- 
cgit v1.2.3


From 890d550d7dbac7a31ecaa78732aa22be282bb6b8 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Fri, 8 Apr 2022 20:19:14 +0800
Subject: sched/psi: report zeroes for CPU full at the system level

Martin find it confusing when look at the /proc/pressure/cpu output,
and found no hint about that CPU "full" line in psi Documentation.

% cat /proc/pressure/cpu
some avg10=0.92 avg60=0.91 avg300=0.73 total=933490489
full avg10=0.22 avg60=0.23 avg300=0.16 total=358783277

The PSI_CPU_FULL state is introduced by commit e7fcd7622823
("psi: Add PSI_CPU_FULL state"), which mainly for cgroup level,
but also counted at the system level as a side effect.

Naturally, the FULL state doesn't exist for the CPU resource at
the system level. These "full" numbers can come from CPU idle
schedule latency. For example, t1 is the time when task wakeup
on an idle CPU, t2 is the time when CPU pick and switch to it.
The delta of (t2 - t1) will be in CPU_FULL state.

Another case all processes can be stalled is when all cgroups
have been throttled at the same time, which unlikely to happen.

Anyway, CPU_FULL metric is meaningless and confusing at the
system level. So this patch will report zeroes for CPU full
at the system level, and update psi Documentation accordingly.

Fixes: e7fcd7622823 ("psi: Add PSI_CPU_FULL state")
Reported-by: Martin Steigerwald <Martin.Steigerwald@proact.de>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20220408121914.82855-1-zhouchengming@bytedance.com
---
 kernel/sched/psi.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 5a49a8c8783e..a337f3e35997 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	mutex_unlock(&group->avgs_lock);
 
 	for (full = 0; full < 2; full++) {
-		unsigned long avg[3];
-		u64 total;
+		unsigned long avg[3] = { 0, };
+		u64 total = 0;
 		int w;
 
-		for (w = 0; w < 3; w++)
-			avg[w] = group->avg[res * 2 + full][w];
-		total = div_u64(group->total[PSI_AVGS][res * 2 + full],
-				NSEC_PER_USEC);
+		/* CPU FULL is undefined at the system level */
+		if (!(group == &psi_system && res == PSI_CPU && full)) {
+			for (w = 0; w < 3; w++)
+				avg[w] = group->avg[res * 2 + full][w];
+			total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+					NSEC_PER_USEC);
+		}
 
 		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
 			   full ? "full" : "some",
-- 
cgit v1.2.3


From a658353167bf2ea6052cee071dbcc13e0f229dc9 Mon Sep 17 00:00:00 2001
From: Tao Zhou <tao.zhou@linux.dev>
Date: Fri, 15 Apr 2022 17:55:04 +0800
Subject: sched/fair: Revise comment about lb decision matrix

If busiest group type is group_misfit_task, the local
group type must be group_has_spare according to below
code in update_sd_pick_busiest():

  if (sgs->group_type == group_misfit_task &&
      (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
       sds->local_stat.group_type != group_has_spare))
	   return false;

group type imbalanced and overloaded and fully_busy are filtered in here.
misfit and asym are filtered before in update_sg_lb_stats().
So, change the decision matrix to:

  busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
  has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
  fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
  misfit_task      force     N/A        N/A    N/A  *N/A*      *N/A*
  asym_packing     force     force      N/A    N/A  force      force
  imbalanced       force     force      N/A    N/A  force      force
  overloaded       force     force      N/A    N/A  force      avg_load

Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()")
Signed-off-by: Tao Zhou <tao.zhou@linux.dev>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20220415095505.7765-1-tao.zhou@linux.dev
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3eba0dcc4962..4c420124b5d6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9496,7 +9496,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
  * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
  * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
- * misfit_task      force     N/A        N/A    N/A  force      force
+ * misfit_task      force     N/A        N/A    N/A  N/A        N/A
  * asym_packing     force     force      N/A    N/A  force      force
  * imbalanced       force     force      N/A    N/A  force      force
  * overloaded       force     force      N/A    N/A  force      avg_load
-- 
cgit v1.2.3


From 03f16cd020eb8bb2eb837e2090086f296a9fa91d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 18 Apr 2022 09:50:36 -0700
Subject: objtool: Add CONFIG_OBJTOOL

Now that stack validation is an optional feature of objtool, add
CONFIG_OBJTOOL and replace most usages of CONFIG_STACK_VALIDATION with
it.

CONFIG_STACK_VALIDATION can now be considered to be frame-pointer
specific.  CONFIG_UNWINDER_ORC is already inherently valid for live
patching, so no need to "validate" it.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Link: https://lkml.kernel.org/r/939bf3d85604b2a126412bf11af6e3bd3b872bcb.1650300597.git.jpoimboe@redhat.com
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2c43e327a619..2956bc277150 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -728,6 +728,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL
 	depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
 	depends on !FTRACE_MCOUNT_USE_CC
 	depends on FTRACE_MCOUNT_RECORD
+	select OBJTOOL
 
 config FTRACE_MCOUNT_USE_RECORDMCOUNT
 	def_bool y
-- 
cgit v1.2.3


From 9e4ab6c89109472082616f8d2f6ada7deaffe161 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 19 Apr 2022 12:22:19 +0100
Subject: arm64/sme: Implement vector length configuration prctl()s

As for SVE provide a prctl() interface which allows processes to
configure their SME vector length.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20220419112247.711548-12-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 kernel/sys.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 374f83e95239..b911fa6d81ab 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -117,6 +117,12 @@
 #ifndef SVE_GET_VL
 # define SVE_GET_VL()		(-EINVAL)
 #endif
+#ifndef SME_SET_VL
+# define SME_SET_VL(a)		(-EINVAL)
+#endif
+#ifndef SME_GET_VL
+# define SME_GET_VL()		(-EINVAL)
+#endif
 #ifndef PAC_RESET_KEYS
 # define PAC_RESET_KEYS(a, b)	(-EINVAL)
 #endif
@@ -2541,6 +2547,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SVE_GET_VL:
 		error = SVE_GET_VL();
 		break;
+	case PR_SME_SET_VL:
+		error = SME_SET_VL(arg2);
+		break;
+	case PR_SME_GET_VL:
+		error = SME_GET_VL();
+		break;
 	case PR_GET_SPECULATION_CTRL:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
-- 
cgit v1.2.3


From faebd693c59387b7b765fab64b543855e15a91b4 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:36 +0206
Subject: printk: rename cpulock functions

Since the printk cpulock is CPU-reentrant and since it is used
in all contexts, its usage must be carefully considered and
most likely will require programming locklessly. To avoid
mistaking the printk cpulock as a typical lock, rename it to
cpu_sync. The main functions then become:

    printk_cpu_sync_get_irqsave(flags);
    printk_cpu_sync_put_irqrestore(flags);

Add extra notes of caution in the function description to help
developers understand the requirements for correct usage.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-2-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 71 +++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index da03c15ecc89..13a1eebe72af 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3667,26 +3667,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 #endif
 
 #ifdef CONFIG_SMP
-static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1);
-static atomic_t printk_cpulock_nested = ATOMIC_INIT(0);
+static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
 
 /**
- * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant
- *                               spinning lock is not owned by any CPU.
+ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
+ *                            spinning lock is not owned by any CPU.
  *
  * Context: Any context.
  */
-void __printk_wait_on_cpu_lock(void)
+void __printk_cpu_sync_wait(void)
 {
 	do {
 		cpu_relax();
-	} while (atomic_read(&printk_cpulock_owner) != -1);
+	} while (atomic_read(&printk_cpu_sync_owner) != -1);
 }
-EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
+EXPORT_SYMBOL(__printk_cpu_sync_wait);
 
 /**
- * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant
- *                          spinning lock.
+ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
+ *                               spinning lock.
  *
  * If no processor has the lock, the calling processor takes the lock and
  * becomes the owner. If the calling processor is already the owner of the
@@ -3695,7 +3695,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
  * Context: Any context. Expects interrupts to be disabled.
  * Return: 1 on success, otherwise 0.
  */
-int __printk_cpu_trylock(void)
+int __printk_cpu_sync_try_get(void)
 {
 	int cpu;
 	int old;
@@ -3705,79 +3705,80 @@ int __printk_cpu_trylock(void)
 	/*
 	 * Guarantee loads and stores from this CPU when it is the lock owner
 	 * are _not_ visible to the previous lock owner. This pairs with
-	 * __printk_cpu_unlock:B.
+	 * __printk_cpu_sync_put:B.
 	 *
 	 * Memory barrier involvement:
 	 *
-	 * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then
-	 * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B.
+	 * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+	 * then __printk_cpu_sync_put:A can never read from
+	 * __printk_cpu_sync_try_get:B.
 	 *
 	 * Relies on:
 	 *
-	 * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+	 * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
 	 * of the previous CPU
 	 *    matching
-	 * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
-	 * of this CPU
+	 * ACQUIRE from __printk_cpu_sync_try_get:A to
+	 * __printk_cpu_sync_try_get:B of this CPU
 	 */
-	old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1,
-				     cpu); /* LMM(__printk_cpu_trylock:A) */
+	old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
+				     cpu); /* LMM(__printk_cpu_sync_try_get:A) */
 	if (old == -1) {
 		/*
 		 * This CPU is now the owner and begins loading/storing
-		 * data: LMM(__printk_cpu_trylock:B)
+		 * data: LMM(__printk_cpu_sync_try_get:B)
 		 */
 		return 1;
 
 	} else if (old == cpu) {
 		/* This CPU is already the owner. */
-		atomic_inc(&printk_cpulock_nested);
+		atomic_inc(&printk_cpu_sync_nested);
 		return 1;
 	}
 
 	return 0;
 }
-EXPORT_SYMBOL(__printk_cpu_trylock);
+EXPORT_SYMBOL(__printk_cpu_sync_try_get);
 
 /**
- * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock.
+ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
  *
  * The calling processor must be the owner of the lock.
  *
  * Context: Any context. Expects interrupts to be disabled.
  */
-void __printk_cpu_unlock(void)
+void __printk_cpu_sync_put(void)
 {
-	if (atomic_read(&printk_cpulock_nested)) {
-		atomic_dec(&printk_cpulock_nested);
+	if (atomic_read(&printk_cpu_sync_nested)) {
+		atomic_dec(&printk_cpu_sync_nested);
 		return;
 	}
 
 	/*
 	 * This CPU is finished loading/storing data:
-	 * LMM(__printk_cpu_unlock:A)
+	 * LMM(__printk_cpu_sync_put:A)
 	 */
 
 	/*
 	 * Guarantee loads and stores from this CPU when it was the
 	 * lock owner are visible to the next lock owner. This pairs
-	 * with __printk_cpu_trylock:A.
+	 * with __printk_cpu_sync_try_get:A.
 	 *
 	 * Memory barrier involvement:
 	 *
-	 * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B,
-	 * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A.
+	 * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+	 * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
 	 *
 	 * Relies on:
 	 *
-	 * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
+	 * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
 	 * of this CPU
 	 *    matching
-	 * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
-	 * of the next CPU
+	 * ACQUIRE from __printk_cpu_sync_try_get:A to
+	 * __printk_cpu_sync_try_get:B of the next CPU
 	 */
-	atomic_set_release(&printk_cpulock_owner,
-			   -1); /* LMM(__printk_cpu_unlock:B) */
+	atomic_set_release(&printk_cpu_sync_owner,
+			   -1); /* LMM(__printk_cpu_sync_put:B) */
 }
-EXPORT_SYMBOL(__printk_cpu_unlock);
+EXPORT_SYMBOL(__printk_cpu_sync_put);
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 1f5d783094cf28b4905f51cad846eb5d1db6673e Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:38 +0206
Subject: printk: add missing memory barrier to wake_up_klogd()

It is important that any new records are visible to preparing
waiters before the waker checks if the wait queue is empty.
Otherwise it is possible that:

- there are new records available
- the waker sees an empty wait queue and does not wake
- the preparing waiter sees no new records and begins to wait

This is exactly the problem that the function description of
waitqueue_active() warns about.

Use wq_has_sleeper() instead of waitqueue_active() because it
includes the necessary full memory barrier.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-4-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13a1eebe72af..f817dfb4852d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -746,8 +746,19 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			goto out;
 		}
 
+		/*
+		 * Guarantee this task is visible on the waitqueue before
+		 * checking the wake condition.
+		 *
+		 * The full memory barrier within set_current_state() of
+		 * prepare_to_wait_event() pairs with the full memory barrier
+		 * within wq_has_sleeper().
+		 *
+		 * This pairs with wake_up_klogd:A.
+		 */
 		ret = wait_event_interruptible(log_wait,
-				prb_read_valid(prb, atomic64_read(&user->seq), r));
+				prb_read_valid(prb,
+					atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */
 		if (ret)
 			goto out;
 	}
@@ -1513,7 +1524,18 @@ static int syslog_print(char __user *buf, int size)
 		seq = syslog_seq;
 
 		mutex_unlock(&syslog_lock);
-		len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL));
+		/*
+		 * Guarantee this task is visible on the waitqueue before
+		 * checking the wake condition.
+		 *
+		 * The full memory barrier within set_current_state() of
+		 * prepare_to_wait_event() pairs with the full memory barrier
+		 * within wq_has_sleeper().
+		 *
+		 * This pairs with wake_up_klogd:A.
+		 */
+		len = wait_event_interruptible(log_wait,
+				prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
 		mutex_lock(&syslog_lock);
 
 		if (len)
@@ -3316,7 +3338,18 @@ void wake_up_klogd(void)
 		return;
 
 	preempt_disable();
-	if (waitqueue_active(&log_wait)) {
+	/*
+	 * Guarantee any new records can be seen by tasks preparing to wait
+	 * before this context checks if the wait queue is empty.
+	 *
+	 * The full memory barrier within wq_has_sleeper() pairs with the full
+	 * memory barrier within set_current_state() of
+	 * prepare_to_wait_event(), which is called after ___wait_event() adds
+	 * the waiter but before it has checked the wait condition.
+	 *
+	 * This pairs with devkmsg_read:A and syslog_print:A.
+	 */
+	if (wq_has_sleeper(&log_wait)) { /* LMM(wake_up_klogd:A) */
 		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
 		irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	}
-- 
cgit v1.2.3


From 938ba4084abcf6fdd21d9078513c52f8fb9b00d0 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:39 +0206
Subject: printk: wake up all waiters

There can be multiple tasks waiting for new records. They should
all be woken. Use wake_up_interruptible_all() instead of
wake_up_interruptible().

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-5-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f817dfb4852d..e23357002648 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3326,7 +3326,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
-		wake_up_interruptible(&log_wait);
+		wake_up_interruptible_all(&log_wait);
 }
 
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
-- 
cgit v1.2.3


From 5341b93dea8c39d7612f7a227015d4b1d5cf30db Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:40 +0206
Subject: printk: wake waiters for safe and NMI contexts

When printk() is called from safe or NMI contexts, it will directly
store the record (vprintk_store()) and then defer the console output.
However, defer_console_output() only causes console printing and does
not wake any waiters of new records.

Wake waiters from defer_console_output() so that they also are aware
of the new records from safe and NMI contexts.

Fixes: 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when accessing the main log buffer in NMI")
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-6-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e23357002648..7bb148a1debb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -754,7 +754,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		 * prepare_to_wait_event() pairs with the full memory barrier
 		 * within wq_has_sleeper().
 		 *
-		 * This pairs with wake_up_klogd:A.
+		 * This pairs with __wake_up_klogd:A.
 		 */
 		ret = wait_event_interruptible(log_wait,
 				prb_read_valid(prb,
@@ -1532,7 +1532,7 @@ static int syslog_print(char __user *buf, int size)
 		 * prepare_to_wait_event() pairs with the full memory barrier
 		 * within wq_has_sleeper().
 		 *
-		 * This pairs with wake_up_klogd:A.
+		 * This pairs with __wake_up_klogd:A.
 		 */
 		len = wait_event_interruptible(log_wait,
 				prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
@@ -3332,7 +3332,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
 	IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
 
-void wake_up_klogd(void)
+static void __wake_up_klogd(int val)
 {
 	if (!printk_percpu_data_ready())
 		return;
@@ -3349,22 +3349,26 @@ void wake_up_klogd(void)
 	 *
 	 * This pairs with devkmsg_read:A and syslog_print:A.
 	 */
-	if (wq_has_sleeper(&log_wait)) { /* LMM(wake_up_klogd:A) */
-		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+	if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
+	    (val & PRINTK_PENDING_OUTPUT)) {
+		this_cpu_or(printk_pending, val);
 		irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	}
 	preempt_enable();
 }
 
-void defer_console_output(void)
+void wake_up_klogd(void)
 {
-	if (!printk_percpu_data_ready())
-		return;
+	__wake_up_klogd(PRINTK_PENDING_WAKEUP);
+}
 
-	preempt_disable();
-	this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
-	irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
-	preempt_enable();
+void defer_console_output(void)
+{
+	/*
+	 * New messages may have been added directly to the ringbuffer
+	 * using vprintk_store(), so wake any waiters as well.
+	 */
+	__wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
 }
 
 void printk_trigger_flush(void)
-- 
cgit v1.2.3


From 9f0844de49cf0557d5c359131004acbb179c174e Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:41 +0206
Subject: printk: get caller_id/timestamp after migration disable

Currently the local CPU timestamp and caller_id for the record are
collected while migration is enabled. Since this information is
CPU-specific, it should be collected with migration disabled.

Migration is disabled immediately after collecting this information
anyway, so just move the information collection to after the
migration disabling.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-7-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7bb148a1debb..82ad3d3d0d4a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2063,7 +2063,7 @@ static inline void printk_delay(void)
 static inline u32 printk_caller_id(void)
 {
 	return in_task() ? task_pid_nr(current) :
-		0x80000000 + raw_smp_processor_id();
+		0x80000000 + smp_processor_id();
 }
 
 /**
@@ -2145,7 +2145,6 @@ int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
 		  const char *fmt, va_list args)
 {
-	const u32 caller_id = printk_caller_id();
 	struct prb_reserved_entry e;
 	enum printk_info_flags flags = 0;
 	struct printk_record r;
@@ -2155,10 +2154,14 @@ int vprintk_store(int facility, int level,
 	u8 *recursion_ptr;
 	u16 reserve_size;
 	va_list args2;
+	u32 caller_id;
 	u16 text_len;
 	int ret = 0;
 	u64 ts_nsec;
 
+	if (!printk_enter_irqsave(recursion_ptr, irqflags))
+		return 0;
+
 	/*
 	 * Since the duration of printk() can vary depending on the message
 	 * and state of the ringbuffer, grab the timestamp now so that it is
@@ -2167,8 +2170,7 @@ int vprintk_store(int facility, int level,
 	 */
 	ts_nsec = local_clock();
 
-	if (!printk_enter_irqsave(recursion_ptr, irqflags))
-		return 0;
+	caller_id = printk_caller_id();
 
 	/*
 	 * The sprintf needs to come first since the syslog prefix might be
-- 
cgit v1.2.3


From 1f47e8af45fdd754c5538f8aeb7857a6c087fd6a Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:42 +0206
Subject: printk: call boot_delay_msec() in printk_delay()

boot_delay_msec() is always called immediately before printk_delay()
so just call it from within printk_delay().

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-8-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 82ad3d3d0d4a..2f99e0b383b9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2048,8 +2048,10 @@ static u8 *__printk_recursion_counter(void)
 
 int printk_delay_msec __read_mostly;
 
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
 {
+	boot_delay_msec(level);
+
 	if (unlikely(printk_delay_msec)) {
 		int m = printk_delay_msec;
 
@@ -2274,8 +2276,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 		in_sched = true;
 	}
 
-	boot_delay_msec(level);
-	printk_delay();
+	printk_delay(level);
 
 	printed_len = vprintk_store(facility, level, dev_info, fmt, args);
 
-- 
cgit v1.2.3


From 1fc0ca9e0db61882208650b3603071e9f4b5cfee Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:43 +0206
Subject: printk: add con_printk() macro for console details

It is useful to generate log messages that include details about
the related console. Rather than duplicate the code to assemble
the details, put that code into a macro con_printk().

Once console printers become threaded, this macro will find more
users.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-9-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 2f99e0b383b9..e36d3ed41afa 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3015,6 +3015,11 @@ static void try_enable_default_console(struct console *newcon)
 		newcon->flags |= CON_CONSDEV;
 }
 
+#define con_printk(lvl, con, fmt, ...)			\
+	printk(lvl pr_fmt("%sconsole [%s%d] " fmt),	\
+	       (con->flags & CON_BOOT) ? "boot" : "",	\
+	       con->name, con->index, ##__VA_ARGS__)
+
 /*
  * The console driver calls this routine during kernel initialization
  * to register the console printing procedure with printk() and to
@@ -3153,9 +3158,7 @@ void register_console(struct console *newcon)
 	 * users know there might be something in the kernel's log buffer that
 	 * went to the bootconsole (that they do not see on the real console)
 	 */
-	pr_info("%sconsole [%s%d] enabled\n",
-		(newcon->flags & CON_BOOT) ? "boot" : "" ,
-		newcon->name, newcon->index);
+	con_printk(KERN_INFO, newcon, "enabled\n");
 	if (bootcon_enabled &&
 	    ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
 	    !keep_bootcon) {
@@ -3174,9 +3177,7 @@ int unregister_console(struct console *console)
 	struct console *con;
 	int res;
 
-	pr_info("%sconsole [%s%d] disabled\n",
-		(console->flags & CON_BOOT) ? "boot" : "" ,
-		console->name, console->index);
+	con_printk(KERN_INFO, console, "disabled\n");
 
 	res = _braille_unregister_console(console);
 	if (res < 0)
-- 
cgit v1.2.3


From a699449bb13b70b8bd10dc03ad7327ea3993221e Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:44 +0206
Subject: printk: refactor and rework printing logic

Refactor/rework printing logic in order to prepare for moving to
threaded console printing.

- Move @console_seq into struct console so that the current
  "position" of each console can be tracked individually.

- Move @console_dropped into struct console so that the current drop
  count of each console can be tracked individually.

- Modify printing logic so that each console independently loads,
  prepares, and prints its next record.

- Remove exclusive_console logic. Since console positions are
  handled independently, replaying past records occurs naturally.

- Update the comments explaining why preemption is disabled while
  printing from printk() context.

With these changes, there is a change in behavior: the console
replaying the log (formerly exclusive console) will no longer block
other consoles. New messages appear on the other consoles while the
newly added console is still replaying.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-10-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 441 +++++++++++++++++++++++++------------------------
 1 file changed, 228 insertions(+), 213 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e36d3ed41afa..3dea8bbaf402 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -280,11 +280,6 @@ static bool panic_in_progress(void)
  */
 static int console_locked, console_suspended;
 
-/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
 /*
  *	Array of consoles built from command line options (console=)
  */
@@ -374,12 +369,6 @@ static u64 syslog_seq;
 static size_t syslog_partial;
 static bool syslog_time;
 
-/* All 3 protected by @console_sem. */
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
-
 struct latched_seq {
 	seqcount_latch_t	latch;
 	u64			val[2];
@@ -1933,47 +1922,26 @@ static int console_trylock_spinning(void)
 }
 
 /*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
+ * Call the specified console driver, asking it to write out the specified
+ * text and length. For non-extended consoles, if any records have been
+ * dropped, a dropped message will be written out first.
  */
-static void call_console_drivers(const char *ext_text, size_t ext_len,
-				 const char *text, size_t len)
+static void call_console_driver(struct console *con, const char *text, size_t len)
 {
 	static char dropped_text[64];
-	size_t dropped_len = 0;
-	struct console *con;
+	size_t dropped_len;
 
 	trace_console_rcuidle(text, len);
 
-	if (!console_drivers)
-		return;
-
-	if (console_dropped) {
+	if (con->dropped && !(con->flags & CON_EXTENDED)) {
 		dropped_len = snprintf(dropped_text, sizeof(dropped_text),
 				       "** %lu printk messages dropped **\n",
-				       console_dropped);
-		console_dropped = 0;
+				       con->dropped);
+		con->dropped = 0;
+		con->write(con, dropped_text, dropped_len);
 	}
 
-	for_each_console(con) {
-		if (exclusive_console && con != exclusive_console)
-			continue;
-		if (!(con->flags & CON_ENABLED))
-			continue;
-		if (!con->write)
-			continue;
-		if (!cpu_online(smp_processor_id()) &&
-		    !(con->flags & CON_ANYTIME))
-			continue;
-		if (con->flags & CON_EXTENDED)
-			con->write(con, ext_text, ext_len);
-		else {
-			if (dropped_len)
-				con->write(con, dropped_text, dropped_len);
-			con->write(con, text, len);
-		}
-	}
+	con->write(con, text, len);
 }
 
 /*
@@ -2283,15 +2251,18 @@ asmlinkage int vprintk_emit(int facility, int level,
 	/* If called from the scheduler, we can not call up(). */
 	if (!in_sched) {
 		/*
-		 * Disable preemption to avoid being preempted while holding
-		 * console_sem which would prevent anyone from printing to
-		 * console
+		 * The caller may be holding system-critical or
+		 * timing-sensitive locks. Disable preemption during
+		 * printing of all remaining records to all consoles so that
+		 * this context can return as soon as possible. Hopefully
+		 * another printk() caller will take over the printing.
 		 */
 		preempt_disable();
 		/*
 		 * Try to acquire and then immediately release the console
-		 * semaphore.  The release will print out buffers and wake up
-		 * /dev/kmsg and syslog() users.
+		 * semaphore. The release will print out buffers. With the
+		 * spinning variant, this context tries to take over the
+		 * printing from another printing context.
 		 */
 		if (console_trylock_spinning())
 			console_unlock();
@@ -2329,11 +2300,9 @@ EXPORT_SYMBOL(_printk);
 
 #define prb_read_valid(rb, seq, r)	false
 #define prb_first_valid_seq(rb)		0
+#define prb_next_seq(rb)		0
 
 static u64 syslog_seq;
-static u64 console_seq;
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
 
 static size_t record_print_text(const struct printk_record *r,
 				bool syslog, bool time)
@@ -2350,8 +2319,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
 				  struct dev_printk_info *dev_info) { return 0; }
 static void console_lock_spinning_enable(void) { }
 static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
-				 const char *text, size_t len) {}
+static void call_console_driver(struct console *con, const char *text, size_t len) { }
 static bool suppress_message_printing(int level) { return false; }
 
 #endif /* CONFIG_PRINTK */
@@ -2621,22 +2589,6 @@ int is_console_locked(void)
 }
 EXPORT_SYMBOL(is_console_locked);
 
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
-	struct console *con;
-
-	for_each_console(con)
-		if ((con->flags & CON_ENABLED) &&
-				(con->flags & CON_ANYTIME))
-			return 1;
-
-	return 0;
-}
-
 /*
  * Return true when this CPU should unlock console_sem without pushing all
  * messages to the console. This reduces the chance that the console is
@@ -2657,15 +2609,182 @@ static bool abandon_console_lock_in_panic(void)
 }
 
 /*
- * Can we actually use the console at this time on this cpu?
+ * Check if the given console is currently capable and allowed to print
+ * records.
+ *
+ * Requires the console_lock.
+ */
+static inline bool console_is_usable(struct console *con)
+{
+	if (!(con->flags & CON_ENABLED))
+		return false;
+
+	if (!con->write)
+		return false;
+
+	/*
+	 * Console drivers may assume that per-cpu resources have been
+	 * allocated. So unless they're explicitly marked as being able to
+	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+	 */
+	if (!cpu_online(raw_smp_processor_id()) &&
+	    !(con->flags & CON_ANYTIME))
+		return false;
+
+	return true;
+}
+
+static void __console_unlock(void)
+{
+	console_locked = 0;
+	up_console_sem();
+}
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns false if the given console has no next record to print, otherwise
+ * true.
  *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * Requires the console_lock.
  */
-static inline int can_use_console(void)
+static bool console_emit_next_record(struct console *con, bool *handover)
 {
-	return cpu_online(raw_smp_processor_id()) || have_callable_console();
+	static char ext_text[CONSOLE_EXT_LOG_MAX];
+	static char text[CONSOLE_LOG_MAX];
+	static int panic_console_dropped;
+	struct printk_info info;
+	struct printk_record r;
+	unsigned long flags;
+	char *write_text;
+	size_t len;
+
+	prb_rec_init_rd(&r, &info, text, sizeof(text));
+
+	*handover = false;
+
+	if (!prb_read_valid(prb, con->seq, &r))
+		return false;
+
+	if (con->seq != r.info->seq) {
+		con->dropped += r.info->seq - con->seq;
+		con->seq = r.info->seq;
+		if (panic_in_progress() && panic_console_dropped++ > 10) {
+			suppress_panic_printk = 1;
+			pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
+		}
+	}
+
+	/* Skip record that has level above the console loglevel. */
+	if (suppress_message_printing(r.info->level)) {
+		con->seq++;
+		goto skip;
+	}
+
+	if (con->flags & CON_EXTENDED) {
+		write_text = &ext_text[0];
+		len = info_print_ext_header(ext_text, sizeof(ext_text), r.info);
+		len += msg_print_ext_body(ext_text + len, sizeof(ext_text) - len,
+					  &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+	} else {
+		write_text = &text[0];
+		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+	}
+
+	/*
+	 * While actively printing out messages, if another printk()
+	 * were to occur on another CPU, it may wait for this one to
+	 * finish. This task can not be preempted if there is a
+	 * waiter waiting to take over.
+	 *
+	 * Interrupts are disabled because the hand over to a waiter
+	 * must not be interrupted until the hand over is completed
+	 * (@console_waiter is cleared).
+	 */
+	printk_safe_enter_irqsave(flags);
+	console_lock_spinning_enable();
+
+	stop_critical_timings();	/* don't trace print latency */
+	call_console_driver(con, write_text, len);
+	start_critical_timings();
+
+	con->seq++;
+
+	*handover = console_lock_spinning_disable_and_check();
+	printk_safe_exit_irqrestore(flags);
+skip:
+	return true;
+}
+
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+	bool any_usable = false;
+	struct console *con;
+	bool any_progress;
+
+	*next_seq = 0;
+	*handover = false;
+
+	do {
+		any_progress = false;
+
+		for_each_console(con) {
+			bool progress;
+
+			if (!console_is_usable(con))
+				continue;
+			any_usable = true;
+
+			progress = console_emit_next_record(con, handover);
+			if (*handover)
+				return false;
+
+			/* Track the next of the highest seq flushed. */
+			if (con->seq > *next_seq)
+				*next_seq = con->seq;
+
+			if (!progress)
+				continue;
+			any_progress = true;
+
+			/* Allow panic_cpu to take over the consoles safely. */
+			if (abandon_console_lock_in_panic())
+				return false;
+
+			if (do_cond_resched)
+				cond_resched();
+		}
+	} while (any_progress);
+
+	return any_usable;
 }
 
 /**
@@ -2678,28 +2797,20 @@ static inline int can_use_console(void)
  * by printk().  If this is the case, console_unlock(); emits
  * the output prior to releasing the lock.
  *
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
- *
  * console_unlock(); may be called from any context.
  */
 void console_unlock(void)
 {
-	static char ext_text[CONSOLE_EXT_LOG_MAX];
-	static char text[CONSOLE_LOG_MAX];
-	static int panic_console_dropped;
-	unsigned long flags;
-	bool do_cond_resched, retry;
-	struct printk_info info;
-	struct printk_record r;
-	u64 __maybe_unused next_seq;
+	bool do_cond_resched;
+	bool handover;
+	bool flushed;
+	u64 next_seq;
 
 	if (console_suspended) {
 		up_console_sem();
 		return;
 	}
 
-	prb_rec_init_rd(&r, &info, text, sizeof(text));
-
 	/*
 	 * Console drivers are called with interrupts disabled, so
 	 * @console_may_schedule should be cleared before; however, we may
@@ -2708,125 +2819,34 @@ void console_unlock(void)
 	 * between lines if allowable.  Not doing so can cause a very long
 	 * scheduling stall on a slow console leading to RCU stall and
 	 * softlockup warnings which exacerbate the issue with more
-	 * messages practically incapacitating the system.
-	 *
-	 * console_trylock() is not able to detect the preemptive
-	 * context reliably. Therefore the value must be stored before
-	 * and cleared after the "again" goto label.
+	 * messages practically incapacitating the system. Therefore, create
+	 * a local to use for the printing loop.
 	 */
 	do_cond_resched = console_may_schedule;
-again:
-	console_may_schedule = 0;
-
-	/*
-	 * We released the console_sem lock, so we need to recheck if
-	 * cpu is online and (if not) is there at least one CON_ANYTIME
-	 * console.
-	 */
-	if (!can_use_console()) {
-		console_locked = 0;
-		up_console_sem();
-		return;
-	}
 
-	for (;;) {
-		size_t ext_len = 0;
-		int handover;
-		size_t len;
-
-skip:
-		if (!prb_read_valid(prb, console_seq, &r))
-			break;
-
-		if (console_seq != r.info->seq) {
-			console_dropped += r.info->seq - console_seq;
-			console_seq = r.info->seq;
-			if (panic_in_progress() && panic_console_dropped++ > 10) {
-				suppress_panic_printk = 1;
-				pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
-			}
-		}
-
-		if (suppress_message_printing(r.info->level)) {
-			/*
-			 * Skip record we have buffered and already printed
-			 * directly to the console when we received it, and
-			 * record that has level above the console loglevel.
-			 */
-			console_seq++;
-			goto skip;
-		}
+	do {
+		console_may_schedule = 0;
 
-		/* Output to all consoles once old messages replayed. */
-		if (unlikely(exclusive_console &&
-			     console_seq >= exclusive_console_stop_seq)) {
-			exclusive_console = NULL;
-		}
+		flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
+		if (!handover)
+			__console_unlock();
 
 		/*
-		 * Handle extended console text first because later
-		 * record_print_text() will modify the record buffer in-place.
+		 * Abort if there was a failure to flush all messages to all
+		 * usable consoles. Either it is not possible to flush (in
+		 * which case it would be an infinite loop of retrying) or
+		 * another context has taken over printing.
 		 */
-		if (nr_ext_console_drivers) {
-			ext_len = info_print_ext_header(ext_text,
-						sizeof(ext_text),
-						r.info);
-			ext_len += msg_print_ext_body(ext_text + ext_len,
-						sizeof(ext_text) - ext_len,
-						&r.text_buf[0],
-						r.info->text_len,
-						&r.info->dev_info);
-		}
-		len = record_print_text(&r,
-				console_msg_format & MSG_FORMAT_SYSLOG,
-				printk_time);
-		console_seq++;
+		if (!flushed)
+			break;
 
 		/*
-		 * While actively printing out messages, if another printk()
-		 * were to occur on another CPU, it may wait for this one to
-		 * finish. This task can not be preempted if there is a
-		 * waiter waiting to take over.
-		 *
-		 * Interrupts are disabled because the hand over to a waiter
-		 * must not be interrupted until the hand over is completed
-		 * (@console_waiter is cleared).
+		 * Some context may have added new records after
+		 * console_flush_all() but before unlocking the console.
+		 * Re-check if there is a new record to flush. If the trylock
+		 * fails, another context is already handling the printing.
 		 */
-		printk_safe_enter_irqsave(flags);
-		console_lock_spinning_enable();
-
-		stop_critical_timings();	/* don't trace print latency */
-		call_console_drivers(ext_text, ext_len, text, len);
-		start_critical_timings();
-
-		handover = console_lock_spinning_disable_and_check();
-		printk_safe_exit_irqrestore(flags);
-		if (handover)
-			return;
-
-		/* Allow panic_cpu to take over the consoles safely */
-		if (abandon_console_lock_in_panic())
-			break;
-
-		if (do_cond_resched)
-			cond_resched();
-	}
-
-	/* Get consistent value of the next-to-be-used sequence number. */
-	next_seq = console_seq;
-
-	console_locked = 0;
-	up_console_sem();
-
-	/*
-	 * Someone could have filled up the buffer again, so re-check if there's
-	 * something to flush. In case we cannot trylock the console_sem again,
-	 * there's a new owner and the console_unlock() from them will do the
-	 * flush, no worries.
-	 */
-	retry = prb_read_valid(prb, next_seq, NULL);
-	if (retry && !abandon_console_lock_in_panic() && console_trylock())
-		goto again;
+	} while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
 }
 EXPORT_SYMBOL(console_unlock);
 
@@ -2886,8 +2906,14 @@ void console_flush_on_panic(enum con_flush_mode mode)
 	console_trylock();
 	console_may_schedule = 0;
 
-	if (mode == CONSOLE_REPLAY_ALL)
-		console_seq = prb_first_valid_seq(prb);
+	if (mode == CONSOLE_REPLAY_ALL) {
+		struct console *c;
+		u64 seq;
+
+		seq = prb_first_valid_seq(prb);
+		for_each_console(c)
+			c->seq = seq;
+	}
 	console_unlock();
 }
 
@@ -3127,26 +3153,15 @@ void register_console(struct console *newcon)
 	if (newcon->flags & CON_EXTENDED)
 		nr_ext_console_drivers++;
 
+	newcon->dropped = 0;
 	if (newcon->flags & CON_PRINTBUFFER) {
-		/*
-		 * console_unlock(); will print out the buffered messages
-		 * for us.
-		 *
-		 * We're about to replay the log buffer.  Only do this to the
-		 * just-registered console to avoid excessive message spam to
-		 * the already-registered consoles.
-		 *
-		 * Set exclusive_console with disabled interrupts to reduce
-		 * race window with eventual console_flush_on_panic() that
-		 * ignores console_lock.
-		 */
-		exclusive_console = newcon;
-		exclusive_console_stop_seq = console_seq;
-
 		/* Get a consistent copy of @syslog_seq. */
 		mutex_lock(&syslog_lock);
-		console_seq = syslog_seq;
+		newcon->seq = syslog_seq;
 		mutex_unlock(&syslog_lock);
+	} else {
+		/* Begin with next message. */
+		newcon->seq = prb_next_seq(prb);
 	}
 	console_unlock();
 	console_sysfs_notify();
-- 
cgit v1.2.3


From 03a749e628fdbc665d7f9712637f880a79da8b78 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:45 +0206
Subject: printk: move buffer definitions into console_emit_next_record()
 caller

Extended consoles print extended messages and do not print messages about
dropped records.

Non-extended consoles print "normal" messages as well as extra messages
about dropped records.

Currently the buffers for these various message types are defined within
the functions that might use them and their usage is based upon the
CON_EXTENDED flag. This will be a problem when moving to kthread printers
because each printer must be able to provide its own buffers.

Move all the message buffer definitions outside of
console_emit_next_record(). The caller knows if extended or dropped
messages should be printed and can specify the appropriate buffers to
use. The console_emit_next_record() and call_console_driver() functions
can know what to print based on whether specified buffers are non-NULL.

With this change, buffer definition/allocation/specification is separated
from the code that does the various types of string printing.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-11-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 60 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 3dea8bbaf402..dec5355c5b5b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -394,6 +394,9 @@ static struct latched_seq clear_seq = {
 /* the maximum size of a formatted record (i.e. with prefix added per line) */
 #define CONSOLE_LOG_MAX		1024
 
+/* the maximum size for a dropped text message */
+#define DROPPED_TEXT_MAX	64
+
 /* the maximum size allowed to be reserved for a record */
 #define LOG_LINE_MAX		(CONSOLE_LOG_MAX - PREFIX_MAX)
 
@@ -1923,18 +1926,18 @@ static int console_trylock_spinning(void)
 
 /*
  * Call the specified console driver, asking it to write out the specified
- * text and length. For non-extended consoles, if any records have been
+ * text and length. If @dropped_text is non-NULL and any records have been
  * dropped, a dropped message will be written out first.
  */
-static void call_console_driver(struct console *con, const char *text, size_t len)
+static void call_console_driver(struct console *con, const char *text, size_t len,
+				char *dropped_text)
 {
-	static char dropped_text[64];
 	size_t dropped_len;
 
 	trace_console_rcuidle(text, len);
 
-	if (con->dropped && !(con->flags & CON_EXTENDED)) {
-		dropped_len = snprintf(dropped_text, sizeof(dropped_text),
+	if (con->dropped && dropped_text) {
+		dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
 				       "** %lu printk messages dropped **\n",
 				       con->dropped);
 		con->dropped = 0;
@@ -2296,6 +2299,7 @@ EXPORT_SYMBOL(_printk);
 #else /* CONFIG_PRINTK */
 
 #define CONSOLE_LOG_MAX		0
+#define DROPPED_TEXT_MAX	0
 #define printk_time		false
 
 #define prb_read_valid(rb, seq, r)	false
@@ -2319,7 +2323,10 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
 				  struct dev_printk_info *dev_info) { return 0; }
 static void console_lock_spinning_enable(void) { }
 static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_driver(struct console *con, const char *text, size_t len) { }
+static void call_console_driver(struct console *con, const char *text, size_t len,
+				char *dropped_text)
+{
+}
 static bool suppress_message_printing(int level) { return false; }
 
 #endif /* CONFIG_PRINTK */
@@ -2644,6 +2651,14 @@ static void __console_unlock(void)
  * Print one record for the given console. The record printed is whatever
  * record is the next available record for the given console.
  *
+ * @text is a buffer of size CONSOLE_LOG_MAX.
+ *
+ * If extended messages should be printed, @ext_text is a buffer of size
+ * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL.
+ *
+ * If dropped messages should be printed, @dropped_text is a buffer of size
+ * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
+ *
  * @handover will be set to true if a printk waiter has taken over the
  * console_lock, in which case the caller is no longer holding the
  * console_lock. Otherwise it is set to false.
@@ -2653,10 +2668,9 @@ static void __console_unlock(void)
  *
  * Requires the console_lock.
  */
-static bool console_emit_next_record(struct console *con, bool *handover)
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+				     char *dropped_text, bool *handover)
 {
-	static char ext_text[CONSOLE_EXT_LOG_MAX];
-	static char text[CONSOLE_LOG_MAX];
 	static int panic_console_dropped;
 	struct printk_info info;
 	struct printk_record r;
@@ -2664,7 +2678,7 @@ static bool console_emit_next_record(struct console *con, bool *handover)
 	char *write_text;
 	size_t len;
 
-	prb_rec_init_rd(&r, &info, text, sizeof(text));
+	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
 
 	*handover = false;
 
@@ -2686,13 +2700,13 @@ static bool console_emit_next_record(struct console *con, bool *handover)
 		goto skip;
 	}
 
-	if (con->flags & CON_EXTENDED) {
-		write_text = &ext_text[0];
-		len = info_print_ext_header(ext_text, sizeof(ext_text), r.info);
-		len += msg_print_ext_body(ext_text + len, sizeof(ext_text) - len,
+	if (ext_text) {
+		write_text = ext_text;
+		len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info);
+		len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len,
 					  &r.text_buf[0], r.info->text_len, &r.info->dev_info);
 	} else {
-		write_text = &text[0];
+		write_text = text;
 		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
 	}
 
@@ -2710,7 +2724,7 @@ static bool console_emit_next_record(struct console *con, bool *handover)
 	console_lock_spinning_enable();
 
 	stop_critical_timings();	/* don't trace print latency */
-	call_console_driver(con, write_text, len);
+	call_console_driver(con, write_text, len, dropped_text);
 	start_critical_timings();
 
 	con->seq++;
@@ -2746,6 +2760,9 @@ skip:
  */
 static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
 {
+	static char dropped_text[DROPPED_TEXT_MAX];
+	static char ext_text[CONSOLE_EXT_LOG_MAX];
+	static char text[CONSOLE_LOG_MAX];
 	bool any_usable = false;
 	struct console *con;
 	bool any_progress;
@@ -2763,7 +2780,16 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 				continue;
 			any_usable = true;
 
-			progress = console_emit_next_record(con, handover);
+			if (con->flags & CON_EXTENDED) {
+				/* Extended consoles do not print "dropped messages". */
+				progress = console_emit_next_record(con, &text[0],
+								    &ext_text[0], NULL,
+								    handover);
+			} else {
+				progress = console_emit_next_record(con, &text[0],
+								    NULL, &dropped_text[0],
+								    handover);
+			}
 			if (*handover)
 				return false;
 
-- 
cgit v1.2.3


From 3b604ca81202eea2a917eb6491e90f610fba0ec7 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:46 +0206
Subject: printk: add pr_flush()

Provide a might-sleep function to allow waiting for console printers
to catch up to the latest logged message.

Use pr_flush() whenever it is desirable to get buffered messages
printed before continuing: suspend_console(), resume_console(),
console_stop(), console_start(), console_unblank().

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-12-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index dec5355c5b5b..a06999d55278 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2296,6 +2296,8 @@ asmlinkage __visible int _printk(const char *fmt, ...)
 }
 EXPORT_SYMBOL(_printk);
 
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
+
 #else /* CONFIG_PRINTK */
 
 #define CONSOLE_LOG_MAX		0
@@ -2328,6 +2330,7 @@ static void call_console_driver(struct console *con, const char *text, size_t le
 {
 }
 static bool suppress_message_printing(int level) { return false; }
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -2515,6 +2518,7 @@ void suspend_console(void)
 	if (!console_suspend_enabled)
 		return;
 	pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
+	pr_flush(1000, true);
 	console_lock();
 	console_suspended = 1;
 	up_console_sem();
@@ -2527,6 +2531,7 @@ void resume_console(void)
 	down_console_sem();
 	console_suspended = 0;
 	console_unlock();
+	pr_flush(1000, true);
 }
 
 /**
@@ -2912,6 +2917,9 @@ void console_unblank(void)
 		if ((c->flags & CON_ENABLED) && c->unblank)
 			c->unblank();
 	console_unlock();
+
+	if (!oops_in_progress)
+		pr_flush(1000, true);
 }
 
 /**
@@ -2970,6 +2978,7 @@ struct tty_driver *console_device(int *index)
  */
 void console_stop(struct console *console)
 {
+	__pr_flush(console, 1000, true);
 	console_lock();
 	console->flags &= ~CON_ENABLED;
 	console_unlock();
@@ -2981,6 +2990,7 @@ void console_start(struct console *console)
 	console_lock();
 	console->flags |= CON_ENABLED;
 	console_unlock();
+	__pr_flush(console, 1000, true);
 }
 EXPORT_SYMBOL(console_start);
 
@@ -3352,6 +3362,79 @@ static int __init printk_late_init(void)
 late_initcall(printk_late_init);
 
 #if defined CONFIG_PRINTK
+/* If @con is specified, only wait for that console. Otherwise wait for all. */
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
+{
+	int remaining = timeout_ms;
+	struct console *c;
+	u64 last_diff = 0;
+	u64 printk_seq;
+	u64 diff;
+	u64 seq;
+
+	might_sleep();
+
+	seq = prb_next_seq(prb);
+
+	for (;;) {
+		diff = 0;
+
+		console_lock();
+		for_each_console(c) {
+			if (con && con != c)
+				continue;
+			if (!console_is_usable(c))
+				continue;
+			printk_seq = c->seq;
+			if (printk_seq < seq)
+				diff += seq - printk_seq;
+		}
+		console_unlock();
+
+		if (diff != last_diff && reset_on_progress)
+			remaining = timeout_ms;
+
+		if (diff == 0 || remaining == 0)
+			break;
+
+		if (remaining < 0) {
+			/* no timeout limit */
+			msleep(100);
+		} else if (remaining < 100) {
+			msleep(remaining);
+			remaining = 0;
+		} else {
+			msleep(100);
+			remaining -= 100;
+		}
+
+		last_diff = diff;
+	}
+
+	return (diff == 0);
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms:        The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Process context. May sleep while acquiring console lock.
+ * Return: true if all enabled printers are caught up.
+ */
+bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+	return __pr_flush(NULL, timeout_ms, reset_on_progress);
+}
+EXPORT_SYMBOL(pr_flush);
+
 /*
  * Delayed printk version, for scheduler-internal messages:
  */
-- 
cgit v1.2.3


From 2bb2b7b57f81255c13f4395ea911d6bdc70c9fe2 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:47 +0206
Subject: printk: add functions to prefer direct printing

Once kthread printing is available, console printing will no longer
occur in the context of the printk caller. However, there are some
special contexts where it is desirable for the printk caller to
directly print out kernel messages. Using pr_flush() to wait for
threaded printers is only possible if the caller is in a sleepable
context and the kthreads are active. That is not always the case.

Introduce printk_prefer_direct_enter() and printk_prefer_direct_exit()
functions to explicitly (and globally) activate/deactivate preferred
direct console printing. The term "direct console printing" refers to
printing to all enabled consoles from the context of the printk
caller. The term "prefer" is used because this type of printing is
only best effort. If the console is currently locked or other
printers are already actively printing, the printk caller will need
to rely on the other contexts to handle the printing.

This preferred direct printing is how all printing has been handled
until now (unless it was explicitly deferred).

When kthread printing is introduced, there may be some unanticipated
problems due to kthreads being unable to flush important messages.
In order to minimize such risks, preferred direct printing is
activated for the primary important messages when the system
experiences general types of major errors. These are:

 - emergency reboot/shutdown
 - cpu and rcu stalls
 - hard and soft lockups
 - hung tasks
 - warn
 - sysrq

Note that since kthread printing does not yet exist, no behavior
changes result from this commit. This is only implementing the
counter and marking the various places where preferred direct
printing is active.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org> # for RCU
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-13-john.ogness@linutronix.de
---
 kernel/hung_task.c      | 11 ++++++++++-
 kernel/panic.c          |  4 ++++
 kernel/printk/printk.c  | 28 ++++++++++++++++++++++++++++
 kernel/rcu/tree_stall.h |  2 ++
 kernel/reboot.c         | 14 +++++++++++++-
 kernel/watchdog.c       |  4 ++++
 kernel/watchdog_hld.c   |  4 ++++
 7 files changed, 65 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 52501e5f7655..02a65d554340 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	 * complain:
 	 */
 	if (sysctl_hung_task_warnings) {
+		printk_prefer_direct_enter();
+
 		if (sysctl_hung_task_warnings > 0)
 			sysctl_hung_task_warnings--;
 		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 
 		if (sysctl_hung_task_all_cpu_backtrace)
 			hung_task_show_all_bt = true;
+
+		printk_prefer_direct_exit();
 	}
 
 	touch_nmi_watchdog();
@@ -204,12 +208,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	}
  unlock:
 	rcu_read_unlock();
-	if (hung_task_show_lock)
+	if (hung_task_show_lock) {
+		printk_prefer_direct_enter();
 		debug_show_all_locks();
+		printk_prefer_direct_exit();
+	}
 
 	if (hung_task_show_all_bt) {
 		hung_task_show_all_bt = false;
+		printk_prefer_direct_enter();
 		trigger_all_cpu_backtrace();
+		printk_prefer_direct_exit();
 	}
 
 	if (hung_task_call_panic)
diff --git a/kernel/panic.c b/kernel/panic.c
index 55b50e052ec3..7d422597403f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -560,6 +560,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
+	printk_prefer_direct_enter();
+
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
 			raw_smp_processor_id(), current->pid, file, line,
@@ -597,6 +599,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 
 	/* Just a warning, don't kill lockdep. */
 	add_taint(taint, LOCKDEP_STILL_OK);
+
+	printk_prefer_direct_exit();
 }
 
 #ifndef __WARN_FLAGS
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a06999d55278..ed7f738261cc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -362,6 +362,34 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 static DEFINE_MUTEX(syslog_lock);
 
 #ifdef CONFIG_PRINTK
+static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
+
+/**
+ * printk_prefer_direct_enter - cause printk() calls to attempt direct
+ *                              printing to all enabled consoles
+ *
+ * Since it is not possible to call into the console printing code from any
+ * context, there is no guarantee that direct printing will occur.
+ *
+ * This globally effects all printk() callers.
+ *
+ * Context: Any context.
+ */
+void printk_prefer_direct_enter(void)
+{
+	atomic_inc(&printk_prefer_direct);
+}
+
+/**
+ * printk_prefer_direct_exit - restore printk() behavior
+ *
+ * Context: Any context.
+ */
+void printk_prefer_direct_exit(void)
+{
+	WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
+}
+
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 0c5d8516516a..d612707c2ed0 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -619,6 +619,7 @@ static void print_cpu_stall(unsigned long gps)
 	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
+	printk_prefer_direct_enter();
 	trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
 	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
 	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
@@ -656,6 +657,7 @@ static void print_cpu_stall(unsigned long gps)
 	 */
 	set_tsk_need_resched(current);
 	set_preempt_need_resched();
+	printk_prefer_direct_exit();
 }
 
 static void check_cpu_stall(struct rcu_data *rdp)
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6bcc5d6a6572..4177645e74d6 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -447,9 +447,11 @@ static int __orderly_reboot(void)
 	ret = run_cmd(reboot_cmd);
 
 	if (ret) {
+		printk_prefer_direct_enter();
 		pr_warn("Failed to start orderly reboot: forcing the issue\n");
 		emergency_sync();
 		kernel_restart(NULL);
+		printk_prefer_direct_exit();
 	}
 
 	return ret;
@@ -462,6 +464,7 @@ static int __orderly_poweroff(bool force)
 	ret = run_cmd(poweroff_cmd);
 
 	if (ret && force) {
+		printk_prefer_direct_enter();
 		pr_warn("Failed to start orderly shutdown: forcing the issue\n");
 
 		/*
@@ -471,6 +474,7 @@ static int __orderly_poweroff(bool force)
 		 */
 		emergency_sync();
 		kernel_power_off();
+		printk_prefer_direct_exit();
 	}
 
 	return ret;
@@ -528,6 +532,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
  */
 static void hw_failure_emergency_poweroff_func(struct work_struct *work)
 {
+	printk_prefer_direct_enter();
+
 	/*
 	 * We have reached here after the emergency shutdown waiting period has
 	 * expired. This means orderly_poweroff has not been able to shut off
@@ -544,6 +550,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work)
 	 */
 	pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
 	emergency_restart();
+
+	printk_prefer_direct_exit();
 }
 
 static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
@@ -582,11 +590,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 {
 	static atomic_t allow_proceed = ATOMIC_INIT(1);
 
+	printk_prefer_direct_enter();
+
 	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
 
 	/* Shutdown should be initiated only once. */
 	if (!atomic_dec_and_test(&allow_proceed))
-		return;
+		goto out;
 
 	/*
 	 * Queue a backup emergency shutdown in the event of
@@ -594,6 +604,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 	 */
 	hw_failure_emergency_poweroff(ms_until_forced);
 	orderly_poweroff(true);
+out:
+	printk_prefer_direct_exit();
 }
 EXPORT_SYMBOL_GPL(hw_protection_shutdown);
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9166220457bc..40024e03d422 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		/* Start period for the next softlockup warning. */
 		update_report_ts();
 
+		printk_prefer_direct_enter();
+
 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
 		if (softlockup_panic)
 			panic("softlockup: hung tasks");
+
+		printk_prefer_direct_exit();
 	}
 
 	return HRTIMER_RESTART;
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..701f35f0e2d4 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		if (__this_cpu_read(hard_watchdog_warn) == true)
 			return;
 
+		printk_prefer_direct_enter();
+
 		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
 			 this_cpu);
 		print_modules();
@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
 
+		printk_prefer_direct_exit();
+
 		__this_cpu_write(hard_watchdog_warn, true);
 		return;
 	}
-- 
cgit v1.2.3


From 09c5ba0aa2fcfdadb17d045c3ee6f86d69270df7 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:48 +0206
Subject: printk: add kthread console printers

Create a kthread for each console to perform console printing. During
normal operation (@system_state == SYSTEM_RUNNING), the kthread
printers are responsible for all printing on their respective
consoles.

During non-normal operation, console printing is done as it has been:
within the context of the printk caller or within irqwork triggered
by the printk caller, referred to as direct printing.

Since threaded console printers are responsible for all printing
during normal operation, this also includes messages generated via
deferred printk calls. If direct printing is in effect during a
deferred printk call, the queued irqwork will perform the direct
printing. To make it clear that this is the only time that the
irqwork will perform direct printing, rename the flag
PRINTK_PENDING_OUTPUT to PRINTK_PENDING_DIRECT_OUTPUT.

Threaded console printers synchronize against each other and against
console lockers by taking the console lock for each message that is
printed.

Note that the kthread printers do not care about direct printing.
They will always try to print if new records are available. They can
be blocked by direct printing, but will be woken again once direct
printing is finished.

Console unregistration is a bit tricky because the associated
kthread printer cannot be stopped while the console lock is held.
A policy is implemented that states: whichever task clears
con->thread (under the console lock) is responsible for stopping
the kthread. unregister_console() will clear con->thread while
the console lock is held and then stop the kthread after releasing
the console lock.

For consoles that have implemented the exit() callback, the kthread
is stopped before exit() is called.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-14-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 329 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 307 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ed7f738261cc..e4cdc424c826 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -361,6 +361,13 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 /* syslog_lock protects syslog_* variables and write access to clear_seq. */
 static DEFINE_MUTEX(syslog_lock);
 
+/*
+ * A flag to signify if printk_activate_kthreads() has already started the
+ * kthread printers. If true, any later registered consoles must start their
+ * own kthread directly. The flag is write protected by the console_lock.
+ */
+static bool printk_kthreads_available;
+
 #ifdef CONFIG_PRINTK
 static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
 
@@ -390,6 +397,39 @@ void printk_prefer_direct_exit(void)
 	WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
 }
 
+/*
+ * Calling printk() always wakes kthread printers so that they can
+ * flush the new message to their respective consoles. Also, if direct
+ * printing is allowed, printk() tries to flush the messages directly.
+ *
+ * Direct printing is allowed in situations when the kthreads
+ * are not available or the system is in a problematic state.
+ *
+ * See the implementation about possible races.
+ */
+static inline bool allow_direct_printing(void)
+{
+	/*
+	 * Checking kthread availability is a possible race because the
+	 * kthread printers can become permanently disabled during runtime.
+	 * However, doing that requires holding the console_lock, so any
+	 * pending messages will be direct printed by console_unlock().
+	 */
+	if (!printk_kthreads_available)
+		return true;
+
+	/*
+	 * Prefer direct printing when the system is in a problematic state.
+	 * The context that sets this state will always see the updated value.
+	 * The other contexts do not care. Anyway, direct printing is just a
+	 * best effort. The direct output is only possible when console_lock
+	 * is not already taken and no kthread printers are actively printing.
+	 */
+	return (system_state > SYSTEM_RUNNING ||
+		oops_in_progress ||
+		atomic_read(&printk_prefer_direct));
+}
+
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -2280,10 +2320,10 @@ asmlinkage int vprintk_emit(int facility, int level,
 	printed_len = vprintk_store(facility, level, dev_info, fmt, args);
 
 	/* If called from the scheduler, we can not call up(). */
-	if (!in_sched) {
+	if (!in_sched && allow_direct_printing()) {
 		/*
 		 * The caller may be holding system-critical or
-		 * timing-sensitive locks. Disable preemption during
+		 * timing-sensitive locks. Disable preemption during direct
 		 * printing of all remaining records to all consoles so that
 		 * this context can return as soon as possible. Hopefully
 		 * another printk() caller will take over the printing.
@@ -2326,6 +2366,8 @@ EXPORT_SYMBOL(_printk);
 
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
 
+static void printk_start_kthread(struct console *con);
+
 #else /* CONFIG_PRINTK */
 
 #define CONSOLE_LOG_MAX		0
@@ -2359,6 +2401,8 @@ static void call_console_driver(struct console *con, const char *text, size_t le
 }
 static bool suppress_message_printing(int level) { return false; }
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
+static void printk_start_kthread(struct console *con) { }
+static bool allow_direct_printing(void) { return true; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -2559,6 +2603,13 @@ void resume_console(void)
 	down_console_sem();
 	console_suspended = 0;
 	console_unlock();
+
+	/*
+	 * While suspended, new records may have been added to the
+	 * ringbuffer. Wake up the kthread printers to print them.
+	 */
+	wake_up_klogd();
+
 	pr_flush(1000, true);
 }
 
@@ -2577,6 +2628,9 @@ static int console_cpu_notify(unsigned int cpu)
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
+
+		/* Wake kthread printers. Some may have become usable. */
+		wake_up_klogd();
 	}
 	return 0;
 }
@@ -2648,18 +2702,9 @@ static bool abandon_console_lock_in_panic(void)
 	return atomic_read(&panic_cpu) != raw_smp_processor_id();
 }
 
-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires the console_lock.
- */
-static inline bool console_is_usable(struct console *con)
+static inline bool __console_is_usable(short flags)
 {
-	if (!(con->flags & CON_ENABLED))
-		return false;
-
-	if (!con->write)
+	if (!(flags & CON_ENABLED))
 		return false;
 
 	/*
@@ -2668,12 +2713,26 @@ static inline bool console_is_usable(struct console *con)
 	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
 	 */
 	if (!cpu_online(raw_smp_processor_id()) &&
-	    !(con->flags & CON_ANYTIME))
+	    !(flags & CON_ANYTIME))
 		return false;
 
 	return true;
 }
 
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records.
+ *
+ * Requires the console_lock.
+ */
+static inline bool console_is_usable(struct console *con)
+{
+	if (!con->write)
+		return false;
+
+	return __console_is_usable(con->flags);
+}
+
 static void __console_unlock(void)
 {
 	console_locked = 0;
@@ -2786,8 +2845,8 @@ skip:
  * were flushed to all usable consoles. A returned false informs the caller
  * that everything was not flushed (either there were no usable consoles or
  * another context has taken over printing or it is a panic situation and this
- * is not the panic CPU). Regardless the reason, the caller should assume it
- * is not useful to immediately try again.
+ * is not the panic CPU or direct printing is not preferred). Regardless the
+ * reason, the caller should assume it is not useful to immediately try again.
  *
  * Requires the console_lock.
  */
@@ -2804,6 +2863,10 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 	*handover = false;
 
 	do {
+		/* Let the kthread printers do the work if they can. */
+		if (!allow_direct_printing())
+			return false;
+
 		any_progress = false;
 
 		for_each_console(con) {
@@ -3018,6 +3081,10 @@ void console_start(struct console *console)
 	console_lock();
 	console->flags |= CON_ENABLED;
 	console_unlock();
+
+	/* Wake the newly enabled kthread printer. */
+	wake_up_klogd();
+
 	__pr_flush(console, 1000, true);
 }
 EXPORT_SYMBOL(console_start);
@@ -3218,6 +3285,8 @@ void register_console(struct console *newcon)
 		nr_ext_console_drivers++;
 
 	newcon->dropped = 0;
+	newcon->thread = NULL;
+
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/* Get a consistent copy of @syslog_seq. */
 		mutex_lock(&syslog_lock);
@@ -3227,6 +3296,10 @@ void register_console(struct console *newcon)
 		/* Begin with next message. */
 		newcon->seq = prb_next_seq(prb);
 	}
+
+	if (printk_kthreads_available)
+		printk_start_kthread(newcon);
+
 	console_unlock();
 	console_sysfs_notify();
 
@@ -3253,6 +3326,7 @@ EXPORT_SYMBOL(register_console);
 
 int unregister_console(struct console *console)
 {
+	struct task_struct *thd;
 	struct console *con;
 	int res;
 
@@ -3293,7 +3367,20 @@ int unregister_console(struct console *console)
 		console_drivers->flags |= CON_CONSDEV;
 
 	console->flags &= ~CON_ENABLED;
+
+	/*
+	 * console->thread can only be cleared under the console lock. But
+	 * stopping the thread must be done without the console lock. The
+	 * task that clears @thread is the task that stops the kthread.
+	 */
+	thd = console->thread;
+	console->thread = NULL;
+
 	console_unlock();
+
+	if (thd)
+		kthread_stop(thd);
+
 	console_sysfs_notify();
 
 	if (console->exit)
@@ -3389,6 +3476,20 @@ static int __init printk_late_init(void)
 }
 late_initcall(printk_late_init);
 
+static int __init printk_activate_kthreads(void)
+{
+	struct console *con;
+
+	console_lock();
+	printk_kthreads_available = true;
+	for_each_console(con)
+		printk_start_kthread(con);
+	console_unlock();
+
+	return 0;
+}
+early_initcall(printk_activate_kthreads);
+
 #if defined CONFIG_PRINTK
 /* If @con is specified, only wait for that console. Otherwise wait for all. */
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
@@ -3463,11 +3564,180 @@ bool pr_flush(int timeout_ms, bool reset_on_progress)
 }
 EXPORT_SYMBOL(pr_flush);
 
+static void __printk_fallback_preferred_direct(void)
+{
+	printk_prefer_direct_enter();
+	pr_err("falling back to preferred direct printing\n");
+	printk_kthreads_available = false;
+}
+
+/*
+ * Enter preferred direct printing, but never exit. Mark console threads as
+ * unavailable. The system is then forever in preferred direct printing and
+ * any printing threads will exit.
+ *
+ * Must *not* be called under console_lock. Use
+ * __printk_fallback_preferred_direct() if already holding console_lock.
+ */
+static void printk_fallback_preferred_direct(void)
+{
+	console_lock();
+	__printk_fallback_preferred_direct();
+	console_unlock();
+}
+
+static bool printer_should_wake(struct console *con, u64 seq)
+{
+	short flags;
+
+	if (kthread_should_stop() || !printk_kthreads_available)
+		return true;
+
+	if (console_suspended)
+		return false;
+
+	/*
+	 * This is an unsafe read from con->flags, but a false positive is
+	 * not a problem. Worst case it would allow the printer to wake up
+	 * although it is disabled. But the printer will notice that when
+	 * attempting to print and instead go back to sleep.
+	 */
+	flags = data_race(READ_ONCE(con->flags));
+
+	if (!__console_is_usable(flags))
+		return false;
+
+	return prb_read_valid(prb, seq, NULL);
+}
+
+static int printk_kthread_func(void *data)
+{
+	struct console *con = data;
+	char *dropped_text = NULL;
+	char *ext_text = NULL;
+	bool handover;
+	u64 seq = 0;
+	char *text;
+	int error;
+
+	text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+	if (!text) {
+		con_printk(KERN_ERR, con, "failed to allocate text buffer\n");
+		printk_fallback_preferred_direct();
+		goto out;
+	}
+
+	if (con->flags & CON_EXTENDED) {
+		ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+		if (!ext_text) {
+			con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n");
+			printk_fallback_preferred_direct();
+			goto out;
+		}
+	} else {
+		dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
+		if (!dropped_text) {
+			con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n");
+			printk_fallback_preferred_direct();
+			goto out;
+		}
+	}
+
+	con_printk(KERN_INFO, con, "printing thread started\n");
+
+	for (;;) {
+		/*
+		 * Guarantee this task is visible on the waitqueue before
+		 * checking the wake condition.
+		 *
+		 * The full memory barrier within set_current_state() of
+		 * prepare_to_wait_event() pairs with the full memory barrier
+		 * within wq_has_sleeper().
+		 *
+		 * This pairs with __wake_up_klogd:A.
+		 */
+		error = wait_event_interruptible(log_wait,
+				printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */
+
+		if (kthread_should_stop() || !printk_kthreads_available)
+			break;
+
+		if (error)
+			continue;
+
+		console_lock();
+
+		if (console_suspended) {
+			up_console_sem();
+			continue;
+		}
+
+		if (!console_is_usable(con)) {
+			__console_unlock();
+			continue;
+		}
+
+		/*
+		 * Even though the printk kthread is always preemptible, it is
+		 * still not allowed to call cond_resched() from within
+		 * console drivers. The task may become non-preemptible in the
+		 * console driver call chain. For example, vt_console_print()
+		 * takes a spinlock and then can call into fbcon_redraw(),
+		 * which can conditionally invoke cond_resched().
+		 */
+		console_may_schedule = 0;
+		console_emit_next_record(con, text, ext_text, dropped_text, &handover);
+		if (handover)
+			continue;
+
+		seq = con->seq;
+
+		__console_unlock();
+	}
+
+	con_printk(KERN_INFO, con, "printing thread stopped\n");
+out:
+	kfree(dropped_text);
+	kfree(ext_text);
+	kfree(text);
+
+	console_lock();
+	/*
+	 * If this kthread is being stopped by another task, con->thread will
+	 * already be NULL. That is fine. The important thing is that it is
+	 * NULL after the kthread exits.
+	 */
+	con->thread = NULL;
+	console_unlock();
+
+	return 0;
+}
+
+/* Must be called under console_lock. */
+static void printk_start_kthread(struct console *con)
+{
+	/*
+	 * Do not start a kthread if there is no write() callback. The
+	 * kthreads assume the write() callback exists.
+	 */
+	if (!con->write)
+		return;
+
+	con->thread = kthread_run(printk_kthread_func, con,
+				  "pr/%s%d", con->name, con->index);
+	if (IS_ERR(con->thread)) {
+		con->thread = NULL;
+		con_printk(KERN_ERR, con, "unable to start printing thread\n");
+		__printk_fallback_preferred_direct();
+		return;
+	}
+}
+
 /*
  * Delayed printk version, for scheduler-internal messages:
  */
-#define PRINTK_PENDING_WAKEUP	0x01
-#define PRINTK_PENDING_OUTPUT	0x02
+#define PRINTK_PENDING_WAKEUP		0x01
+#define PRINTK_PENDING_DIRECT_OUTPUT	0x02
 
 static DEFINE_PER_CPU(int, printk_pending);
 
@@ -3475,10 +3745,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
 	int pending = this_cpu_xchg(printk_pending, 0);
 
-	if (pending & PRINTK_PENDING_OUTPUT) {
+	if (pending & PRINTK_PENDING_DIRECT_OUTPUT) {
+		printk_prefer_direct_enter();
+
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
+
+		printk_prefer_direct_exit();
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
@@ -3503,10 +3777,11 @@ static void __wake_up_klogd(int val)
 	 * prepare_to_wait_event(), which is called after ___wait_event() adds
 	 * the waiter but before it has checked the wait condition.
 	 *
-	 * This pairs with devkmsg_read:A and syslog_print:A.
+	 * This pairs with devkmsg_read:A, syslog_print:A, and
+	 * printk_kthread_func:A.
 	 */
 	if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
-	    (val & PRINTK_PENDING_OUTPUT)) {
+	    (val & PRINTK_PENDING_DIRECT_OUTPUT)) {
 		this_cpu_or(printk_pending, val);
 		irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	}
@@ -3524,7 +3799,17 @@ void defer_console_output(void)
 	 * New messages may have been added directly to the ringbuffer
 	 * using vprintk_store(), so wake any waiters as well.
 	 */
-	__wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+	int val = PRINTK_PENDING_WAKEUP;
+
+	/*
+	 * Make sure that some context will print the messages when direct
+	 * printing is allowed. This happens in situations when the kthreads
+	 * may not be as reliable or perhaps unusable.
+	 */
+	if (allow_direct_printing())
+		val |= PRINTK_PENDING_DIRECT_OUTPUT;
+
+	__wake_up_klogd(val);
 }
 
 void printk_trigger_flush(void)
-- 
cgit v1.2.3


From df86ca0d2f0fa6be525a25b0b3d836d361f85754 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 20 Apr 2022 20:39:43 -0700
Subject: bpf: Allow attach TRACING programs through LINK_CREATE command

Allow attaching BTF-aware TRACING programs, previously attachable only
through BPF_RAW_TRACEPOINT_OPEN command, through LINK_CREATE command:

  - BTF-aware raw tracepoints (tp_btf in libbpf lingo);
  - fentry/fexit/fmod_ret programs;
  - BPF LSM programs.

This change converges all bpf_link-based attachments under LINK_CREATE
command allowing to further extend the API with features like BPF cookie
under "multiplexed" link_create section of bpf_attr.

Non-BTF-aware raw tracepoints are left under BPF_RAW_TRACEPOINT_OPEN,
but there is nothing preventing opening them up to LINK_CREATE as well.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Kuifeng Lee <kuifeng@fb.com>
Link: https://lore.kernel.org/bpf/20220421033945.3602803-2-andrii@kernel.org
---
 kernel/bpf/syscall.c | 110 ++++++++++++++++++++++++++-------------------------
 1 file changed, 56 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e9621cfa09f2..e9e3e49c0eb7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3030,66 +3030,45 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
 }
 #endif /* CONFIG_PERF_EVENTS */
 
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
-
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
+				  const char __user *user_tp_name)
 {
 	struct bpf_link_primer link_primer;
 	struct bpf_raw_tp_link *link;
 	struct bpf_raw_event_map *btp;
-	struct bpf_prog *prog;
 	const char *tp_name;
 	char buf[128];
 	int err;
 
-	if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
-		return -EINVAL;
-
-	prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
-	if (IS_ERR(prog))
-		return PTR_ERR(prog);
-
 	switch (prog->type) {
 	case BPF_PROG_TYPE_TRACING:
 	case BPF_PROG_TYPE_EXT:
 	case BPF_PROG_TYPE_LSM:
-		if (attr->raw_tracepoint.name) {
+		if (user_tp_name)
 			/* The attach point for this category of programs
 			 * should be specified via btf_id during program load.
 			 */
-			err = -EINVAL;
-			goto out_put_prog;
-		}
+			return -EINVAL;
 		if (prog->type == BPF_PROG_TYPE_TRACING &&
 		    prog->expected_attach_type == BPF_TRACE_RAW_TP) {
 			tp_name = prog->aux->attach_func_name;
 			break;
 		}
-		err = bpf_tracing_prog_attach(prog, 0, 0);
-		if (err >= 0)
-			return err;
-		goto out_put_prog;
+		return bpf_tracing_prog_attach(prog, 0, 0);
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
-		if (strncpy_from_user(buf,
-				      u64_to_user_ptr(attr->raw_tracepoint.name),
-				      sizeof(buf) - 1) < 0) {
-			err = -EFAULT;
-			goto out_put_prog;
-		}
+		if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+			return -EFAULT;
 		buf[sizeof(buf) - 1] = 0;
 		tp_name = buf;
 		break;
 	default:
-		err = -EINVAL;
-		goto out_put_prog;
+		return -EINVAL;
 	}
 
 	btp = bpf_get_raw_tracepoint(tp_name);
-	if (!btp) {
-		err = -ENOENT;
-		goto out_put_prog;
-	}
+	if (!btp)
+		return -ENOENT;
 
 	link = kzalloc(sizeof(*link), GFP_USER);
 	if (!link) {
@@ -3116,11 +3095,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 
 out_put_btp:
 	bpf_put_raw_tracepoint(btp);
-out_put_prog:
-	bpf_prog_put(prog);
 	return err;
 }
 
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	int fd;
+
+	if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+		return -EINVAL;
+
+	prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+	if (fd < 0)
+		bpf_prog_put(prog);
+	return fd;
+}
+
 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 					     enum bpf_attach_type attach_type)
 {
@@ -3189,7 +3186,13 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_SETSOCKOPT:
 		return BPF_PROG_TYPE_CGROUP_SOCKOPT;
 	case BPF_TRACE_ITER:
+	case BPF_TRACE_RAW_TP:
+	case BPF_TRACE_FENTRY:
+	case BPF_TRACE_FEXIT:
+	case BPF_MODIFY_RETURN:
 		return BPF_PROG_TYPE_TRACING;
+	case BPF_LSM_MAC:
+		return BPF_PROG_TYPE_LSM;
 	case BPF_SK_LOOKUP:
 		return BPF_PROG_TYPE_SK_LOOKUP;
 	case BPF_XDP:
@@ -4246,21 +4249,6 @@ err_put:
 	return err;
 }
 
-static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
-				   struct bpf_prog *prog)
-{
-	if (attr->link_create.attach_type != prog->expected_attach_type)
-		return -EINVAL;
-
-	if (prog->expected_attach_type == BPF_TRACE_ITER)
-		return bpf_iter_link_attach(attr, uattr, prog);
-	else if (prog->type == BPF_PROG_TYPE_EXT)
-		return bpf_tracing_prog_attach(prog,
-					       attr->link_create.target_fd,
-					       attr->link_create.target_btf_id);
-	return -EINVAL;
-}
-
 #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
 static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 {
@@ -4282,15 +4270,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 
 	switch (prog->type) {
 	case BPF_PROG_TYPE_EXT:
-		ret = tracing_bpf_link_attach(attr, uattr, prog);
-		goto out;
+		break;
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
 		if (attr->link_create.attach_type != BPF_PERF_EVENT) {
 			ret = -EINVAL;
 			goto out;
 		}
-		ptype = prog->type;
 		break;
 	case BPF_PROG_TYPE_KPROBE:
 		if (attr->link_create.attach_type != BPF_PERF_EVENT &&
@@ -4298,7 +4284,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 			ret = -EINVAL;
 			goto out;
 		}
-		ptype = prog->type;
 		break;
 	default:
 		ptype = attach_type_to_prog_type(attr->link_create.attach_type);
@@ -4309,7 +4294,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		break;
 	}
 
-	switch (ptype) {
+	switch (prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4319,8 +4304,25 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_EXT:
+		ret = bpf_tracing_prog_attach(prog,
+					      attr->link_create.target_fd,
+					      attr->link_create.target_btf_id);
+		break;
+	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_TRACING:
-		ret = tracing_bpf_link_attach(attr, uattr, prog);
+		if (attr->link_create.attach_type != prog->expected_attach_type) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+			ret = bpf_raw_tp_link_attach(prog, NULL);
+		else if (prog->expected_attach_type == BPF_TRACE_ITER)
+			ret = bpf_iter_link_attach(attr, uattr, prog);
+		else
+			ret = bpf_tracing_prog_attach(prog,
+						      attr->link_create.target_fd,
+						      attr->link_create.target_btf_id);
 		break;
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_SK_LOOKUP:
-- 
cgit v1.2.3


From 9ea4dcf49878bb9546b8fa9319dcbdc9b7ee20f8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 22 Apr 2022 15:58:11 -0700
Subject: PM: CXL: Disable suspend

The CXL specification claims S3 support at a hardware level, but at a
system software level there are some missing pieces. Section 9.4 (CXL
2.0) rightly claims that "CXL mem adapters may need aux power to retain
memory context across S3", but there is no enumeration mechanism for the
OS to determine if a given adapter has that support. Moreover the save
state and resume image for the system may inadvertantly end up in a CXL
device that needs to be restored before the save state is recoverable.
I.e. a circular dependency that is not resolvable without a third party
save-area.

Arrange for the cxl_mem driver to fail S3 attempts. This still nominaly
allows for suspend, but requires unbinding all CXL memory devices before
the suspend to ensure the typical DRAM flow is taken. The cxl_mem unbind
flow is intended to also tear down all CXL memory regions associated
with a given cxl_memdev.

It is reasonable to assume that any device participating in a System RAM
range published in the EFI memory map is covered by aux power and
save-area outside the device itself. So this restriction can be
minimized in the future once pre-existing region enumeration support
arrives, and perhaps a spec update to clarify if the EFI memory map is
sufficent for determining the range of devices managed by
platform-firmware for S3 support.

Per Rafael, if the CXL configuration prevents suspend then it should
fail early before tasks are frozen, and mem_sleep should stop showing
'mem' as an option [1]. Effectively CXL augments the platform suspend
->valid() op since, for example, the ACPI ops are not aware of the CXL /
PCI dependencies. Given the split role of platform firmware vs OS
provisioned CXL memory it is up to the cxl_mem driver to determine if
the CXL configuration has elements that platform firmware may not be
prepared to restore.

Link: https://lore.kernel.org/r/CAJZ5v0hGVN_=3iU8OLpHY3Ak35T5+JcBM-qs8SbojKrpd0VXsA@mail.gmail.com [1]
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Len Brown <len.brown@intel.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/165066828317.3907920.5690432272182042556.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 kernel/power/hibernate.c | 2 +-
 kernel/power/main.c      | 5 ++++-
 kernel/power/suspend.c   | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 938d5c78b421..20a66bf9f465 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -83,7 +83,7 @@ bool hibernation_available(void)
 {
 	return nohibernate == 0 &&
 		!security_locked_down(LOCKDOWN_HIBERNATION) &&
-		!secretmem_active();
+		!secretmem_active() && !cxl_mem_active();
 }
 
 /**
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7e646079fbeb..3e6be1c33e0b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -127,7 +127,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
 	char *s = buf;
 	suspend_state_t i;
 
-	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+		if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+			continue;
 		if (mem_sleep_states[i]) {
 			const char *label = mem_sleep_states[i];
 
@@ -136,6 +138,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
 			else
 				s += sprintf(s, "%s ", label);
 		}
+	}
 
 	/* Convert the last space to a newline if needed. */
 	if (s != buf)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6fcdee7e87a5..827075944d28 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -236,7 +236,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
 
 static bool sleep_state_supported(suspend_state_t state)
 {
-	return state == PM_SUSPEND_TO_IDLE || valid_state(state);
+	return state == PM_SUSPEND_TO_IDLE ||
+	       (valid_state(state) && !cxl_mem_active());
 }
 
 static int platform_suspend_prepare(suspend_state_t state)
-- 
cgit v1.2.3


From 867a448d587e7fa845bceaf4ee1c632448f2a9fa Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 22 Apr 2022 15:03:15 +0300
Subject: fsnotify: pass flags argument to fsnotify_alloc_group()

Add flags argument to fsnotify_alloc_group(), define and use the flag
FSNOTIFY_GROUP_USER in inotify and fanotify instead of the helper
fsnotify_alloc_user_group() to indicate user allocation.

Although the flag FSNOTIFY_GROUP_USER is currently not used after group
allocation, we store the flags argument in the group struct for future
use of other group flags.

Link: https://lore.kernel.org/r/20220422120327.3459282-5-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 kernel/audit_fsnotify.c | 3 ++-
 kernel/audit_tree.c     | 2 +-
 kernel/audit_watch.c    | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 02348b48447c..35fe149586c8 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -181,7 +181,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = {
 
 static int __init audit_fsnotify_init(void)
 {
-	audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+	audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
+						    0);
 	if (IS_ERR(audit_fsnotify_group)) {
 		audit_fsnotify_group = NULL;
 		audit_panic("cannot create audit fsnotify group");
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e7315d487163..b5c02f8573fe 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1074,7 +1074,7 @@ static int __init audit_tree_init(void)
 
 	audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
 
-	audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+	audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
 	if (IS_ERR(audit_tree_group))
 		audit_panic("cannot initialize fsnotify group for rectree watches");
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 713b256be944..4b0957aa2cd4 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -493,7 +493,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
 
 static int __init audit_watch_init(void)
 {
-	audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+	audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
 	if (IS_ERR(audit_watch_group)) {
 		audit_watch_group = NULL;
 		audit_panic("cannot create audit fsnotify group");
-- 
cgit v1.2.3


From f3010343d9e119da35ee864b3a28993bb5c78ed7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 22 Apr 2022 15:03:16 +0300
Subject: fsnotify: make allow_dups a property of the group

Instead of passing the allow_dups argument to fsnotify_add_mark()
as an argument, define the group flag FSNOTIFY_GROUP_DUPS to express
the allow_dups behavior and set this behavior at group creation time
for all calls of fsnotify_add_mark().

Rename the allow_dups argument to generic add_flags argument for future
use.

Link: https://lore.kernel.org/r/20220422120327.3459282-6-amir73il@gmail.com
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 kernel/audit_fsnotify.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 35fe149586c8..6432a37ac1c9 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -100,7 +100,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
 	audit_update_mark(audit_mark, dentry->d_inode);
 	audit_mark->rule = krule;
 
-	ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
+	ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
 	if (ret < 0) {
 		fsnotify_put_mark(&audit_mark->mark);
 		audit_mark = ERR_PTR(ret);
@@ -182,7 +182,7 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = {
 static int __init audit_fsnotify_init(void)
 {
 	audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
-						    0);
+						    FSNOTIFY_GROUP_DUPS);
 	if (IS_ERR(audit_fsnotify_group)) {
 		audit_fsnotify_group = NULL;
 		audit_panic("cannot create audit fsnotify group");
-- 
cgit v1.2.3


From 960bdff24ce802e38df918ebfcbfa62744d6ae22 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 22 Apr 2022 15:03:19 +0300
Subject: audit: use fsnotify group lock helpers

audit inode marks pin the inode so there is no need to set the
FSNOTIFY_GROUP_NOFS flag.

Link: https://lore.kernel.org/r/20220422120327.3459282-9-amir73il@gmail.com
Suggested-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220321112310.vpr7oxro2xkz5llh@quack3.lan/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 kernel/audit_tree.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b5c02f8573fe..e867c17d3f84 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -351,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
 	struct audit_chunk *new;
 	int size;
 
-	mutex_lock(&audit_tree_group->mark_mutex);
+	fsnotify_group_lock(audit_tree_group);
 	/*
 	 * mark_mutex stabilizes chunk attached to the mark so we can check
 	 * whether it didn't change while we've dropped hash_lock.
@@ -368,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
 		replace_mark_chunk(mark, NULL);
 		spin_unlock(&hash_lock);
 		fsnotify_detach_mark(mark);
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		audit_mark_put_chunk(chunk);
 		fsnotify_free_mark(mark);
 		return;
@@ -385,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
 	 */
 	replace_chunk(new, chunk);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&audit_tree_group->mark_mutex);
+	fsnotify_group_unlock(audit_tree_group);
 	audit_mark_put_chunk(chunk);
 	return;
 
 out_mutex:
-	mutex_unlock(&audit_tree_group->mark_mutex);
+	fsnotify_group_unlock(audit_tree_group);
 }
 
 /* Call with group->mark_mutex held, releases it */
@@ -400,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	struct audit_chunk *chunk = alloc_chunk(1);
 
 	if (!chunk) {
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		return -ENOMEM;
 	}
 
 	mark = alloc_mark();
 	if (!mark) {
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		kfree(chunk);
 		return -ENOMEM;
 	}
 
 	if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		fsnotify_put_mark(mark);
 		kfree(chunk);
 		return -ENOSPC;
@@ -422,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
 		fsnotify_detach_mark(mark);
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		fsnotify_free_mark(mark);
 		fsnotify_put_mark(mark);
 		kfree(chunk);
@@ -444,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	 */
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&audit_tree_group->mark_mutex);
+	fsnotify_group_unlock(audit_tree_group);
 	/*
 	 * Drop our initial reference. When mark we point to is getting freed,
 	 * we get notification through ->freeing_mark callback and cleanup
@@ -462,7 +462,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	struct audit_node *p;
 	int n;
 
-	mutex_lock(&audit_tree_group->mark_mutex);
+	fsnotify_group_lock(audit_tree_group);
 	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
 	if (!mark)
 		return create_chunk(inode, tree);
@@ -478,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	for (n = 0; n < old->count; n++) {
 		if (old->owners[n].owner == tree) {
 			spin_unlock(&hash_lock);
-			mutex_unlock(&audit_tree_group->mark_mutex);
+			fsnotify_group_unlock(audit_tree_group);
 			fsnotify_put_mark(mark);
 			return 0;
 		}
@@ -487,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 
 	chunk = alloc_chunk(old->count + 1);
 	if (!chunk) {
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		fsnotify_put_mark(mark);
 		return -ENOMEM;
 	}
@@ -495,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	spin_lock(&hash_lock);
 	if (tree->goner) {
 		spin_unlock(&hash_lock);
-		mutex_unlock(&audit_tree_group->mark_mutex);
+		fsnotify_group_unlock(audit_tree_group);
 		fsnotify_put_mark(mark);
 		kfree(chunk);
 		return 0;
@@ -515,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	 */
 	replace_chunk(chunk, old);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&audit_tree_group->mark_mutex);
+	fsnotify_group_unlock(audit_tree_group);
 	fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
 	audit_mark_put_chunk(old);
 
@@ -1044,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
 {
 	struct audit_chunk *chunk;
 
-	mutex_lock(&mark->group->mark_mutex);
+	fsnotify_group_lock(mark->group);
 	spin_lock(&hash_lock);
 	chunk = mark_chunk(mark);
 	replace_mark_chunk(mark, NULL);
 	spin_unlock(&hash_lock);
-	mutex_unlock(&mark->group->mark_mutex);
+	fsnotify_group_unlock(mark->group);
 	if (chunk) {
 		evict_chunk(chunk);
 		audit_mark_put_chunk(chunk);
-- 
cgit v1.2.3


From 62c1256d544747b38e77ca9b5bfe3a26f9592576 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sat, 23 Apr 2022 00:14:46 +1000
Subject: timers/nohz: Switch to ONESHOT_STOPPED in the low-res handler when
 the tick is stopped

When tick_nohz_stop_tick() stops the tick and high resolution timers are
disabled, then the clock event device is not put into ONESHOT_STOPPED
mode. This can lead to spurious timer interrupts with some clock event
device drivers that don't shut down entirely after firing.

Eliminate these by putting the device into ONESHOT_STOPPED mode at points
where it is not being reprogrammed. When there are no timers active, then
tick_program_event() with KTIME_MAX can be used to stop the device. When
there is a timer active, the device can be stopped at the next tick (any
new timer added by timers will reprogram the tick).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220422141446.915024-1-npiggin@gmail.com
---
 kernel/time/tick-sched.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2d76c91b85de..b1b105db31eb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -928,6 +928,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	if (unlikely(expires == KTIME_MAX)) {
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 			hrtimer_cancel(&ts->sched_timer);
+		else
+			tick_program_event(KTIME_MAX, 1);
 		return;
 	}
 
@@ -1364,9 +1366,15 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	tick_sched_do_timer(ts, now);
 	tick_sched_handle(ts, regs);
 
-	/* No need to reprogram if we are running tickless  */
-	if (unlikely(ts->tick_stopped))
+	if (unlikely(ts->tick_stopped)) {
+		/*
+		 * The clockevent device is not reprogrammed, so change the
+		 * clock event device to ONESHOT_STOPPED to avoid spurious
+		 * interrupts on devices which might not be truly one shot.
+		 */
+		tick_program_event(KTIME_MAX, 1);
 		return;
+	}
 
 	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
 	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
-- 
cgit v1.2.3


From 1adb4d7ad3a585b451f5cf6b0a90c5917af3eac5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Fri, 22 Apr 2022 12:04:18 +0100
Subject: genirq/matrix: Remove redundant assignment to variable 'end'

Variable end is being initialized with a value that is never read, it
is being re-assigned later with the same value. The initialization is
redundant and can be removed.

Cleans up clang scan build warning:
kernel/irq/matrix.c:289:25: warning: Value stored to 'end' during its
initialization is never read [deadcode.DeadStores]

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tom Rix <trix@redhat.com>
Link: https://lore.kernel.org/r/20220422110418.1264778-1-colin.i.king@gmail.com
---
 kernel/irq/matrix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index bbfb26489aa1..1698e77645ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -286,7 +286,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
 int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
 			     unsigned int *mapped_cpu)
 {
-	unsigned int bit, cpu, end = m->alloc_end;
+	unsigned int bit, cpu, end;
 	struct cpumap *cm;
 
 	if (cpumask_empty(msk))
-- 
cgit v1.2.3


From a467257ffe4bdb13eacddec0137013f6a1140b81 Mon Sep 17 00:00:00 2001
From: yingelin <yingelin@huawei.com>
Date: Sun, 24 Apr 2022 10:57:40 +0800
Subject: kernel/kexec_core: move kexec_core sysctls into its own file

This move the kernel/kexec_core.c respective sysctls to its own file.

kernel/sysctl.c has grown to an insane mess, We move sysctls to places
where features actually belong to improve the readability and reduce
merge conflicts. At the same time, the proc-sysctl maintainers can easily
care about the core logic other than the sysctl knobs added for some feature.

We already moved all filesystem sysctls out. This patch is part of the effort
to move kexec related sysctls out.

Signed-off-by: yingelin <yingelin@huawei.com>
Acked-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/kexec_core.c | 22 ++++++++++++++++++++++
 kernel/sysctl.c     | 13 -------------
 2 files changed, 22 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 68480f731192..a0456baf52cc 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -936,6 +936,28 @@ int kimage_load_segment(struct kimage *image,
 struct kimage *kexec_image;
 struct kimage *kexec_crash_image;
 int kexec_load_disabled;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kexec_core_sysctls[] = {
+	{
+		.procname	= "kexec_load_disabled",
+		.data		= &kexec_load_disabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
+static int __init kexec_core_sysctl_init(void)
+{
+	register_sysctl_init("kernel", kexec_core_sysctls);
+	return 0;
+}
+late_initcall(kexec_core_sysctl_init);
+#endif
 
 /*
  * No panic_cpu check version of crash_kexec().  This function is called
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b60345cbadf0..0f3cb61a2e39 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,7 +61,6 @@
 #include <linux/capability.h>
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
-#include <linux/kexec.h>
 #include <linux/mount.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/pid.h>
@@ -1712,18 +1711,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= tracepoint_printk_sysctl,
 	},
 #endif
-#ifdef CONFIG_KEXEC_CORE
-	{
-		.procname	= "kexec_load_disabled",
-		.data		= &kexec_load_disabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		/* only handle a transition from default "0" to "1" */
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ONE,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_MODULES
 	{
 		.procname	= "modprobe",
-- 
cgit v1.2.3


From d9d31cf88702ae071bec033e5c8714048aa71285 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 25 Apr 2022 15:04:48 -0700
Subject: bpf: Use bpf_prog_run_array_cg_flags everywhere

Rename bpf_prog_run_array_cg_flags to bpf_prog_run_array_cg and
use it everywhere. check_return_code already enforces sane
return ranges for all cgroup types. (only egress and bind hooks have
uncanonical return ranges, the rest is using [0, 1])

No functional changes.

v2:
- 'func_ret & 1' under explicit test (Andrii & Martin)

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220425220448.3669032-1-sdf@google.com
---
 kernel/bpf/cgroup.c | 72 ++++++++++++++++++-----------------------------------
 1 file changed, 24 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 0cb6211fcb58..afb414b26d01 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -25,50 +25,18 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 /* __always_inline is necessary to prevent indirect call through run_prog
  * function pointer.
  */
-static __always_inline int
-bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
-			    enum cgroup_bpf_attach_type atype,
-			    const void *ctx, bpf_prog_run_fn run_prog,
-			    int retval, u32 *ret_flags)
-{
-	const struct bpf_prog_array_item *item;
-	const struct bpf_prog *prog;
-	const struct bpf_prog_array *array;
-	struct bpf_run_ctx *old_run_ctx;
-	struct bpf_cg_run_ctx run_ctx;
-	u32 func_ret;
-
-	run_ctx.retval = retval;
-	migrate_disable();
-	rcu_read_lock();
-	array = rcu_dereference(cgrp->effective[atype]);
-	item = &array->items[0];
-	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
-	while ((prog = READ_ONCE(item->prog))) {
-		run_ctx.prog_item = item;
-		func_ret = run_prog(prog, ctx);
-		if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
-			run_ctx.retval = -EPERM;
-		*(ret_flags) |= (func_ret >> 1);
-		item++;
-	}
-	bpf_reset_run_ctx(old_run_ctx);
-	rcu_read_unlock();
-	migrate_enable();
-	return run_ctx.retval;
-}
-
 static __always_inline int
 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 		      enum cgroup_bpf_attach_type atype,
 		      const void *ctx, bpf_prog_run_fn run_prog,
-		      int retval)
+		      int retval, u32 *ret_flags)
 {
 	const struct bpf_prog_array_item *item;
 	const struct bpf_prog *prog;
 	const struct bpf_prog_array *array;
 	struct bpf_run_ctx *old_run_ctx;
 	struct bpf_cg_run_ctx run_ctx;
+	u32 func_ret;
 
 	run_ctx.retval = retval;
 	migrate_disable();
@@ -78,7 +46,12 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	while ((prog = READ_ONCE(item->prog))) {
 		run_ctx.prog_item = item;
-		if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
+		func_ret = run_prog(prog, ctx);
+		if (ret_flags) {
+			*(ret_flags) |= (func_ret >> 1);
+			func_ret &= 1;
+		}
+		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
 			run_ctx.retval = -EPERM;
 		item++;
 	}
@@ -1144,9 +1117,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 		u32 flags = 0;
 		bool cn;
 
-		ret = bpf_prog_run_array_cg_flags(
-			&cgrp->bpf, atype,
-			skb, __bpf_prog_run_save_cb, 0, &flags);
+		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
+					    __bpf_prog_run_save_cb, 0, &flags);
 
 		/* Return values of CGROUP EGRESS BPF programs are:
 		 *   0: drop packet
@@ -1172,7 +1144,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 			ret = (cn ? NET_XMIT_DROP : ret);
 	} else {
 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
-					    skb, __bpf_prog_run_save_cb, 0);
+					    skb, __bpf_prog_run_save_cb, 0,
+					    NULL);
 		if (ret && !IS_ERR_VALUE((long)ret))
 			ret = -EFAULT;
 	}
@@ -1202,7 +1175,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
-	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
+	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
+				     NULL);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -1247,8 +1221,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype,
-					   &ctx, bpf_prog_run, 0, flags);
+	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+				     0, flags);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
@@ -1275,7 +1249,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 
 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
-				     0);
+				     0, NULL);
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
 
@@ -1292,7 +1266,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+				    NULL);
 	rcu_read_unlock();
 
 	return ret;
@@ -1457,7 +1432,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
-	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+				    NULL);
 	rcu_read_unlock();
 
 	kfree(ctx.cur_val);
@@ -1550,7 +1526,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 	lock_sock(sk);
 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
-				    &ctx, bpf_prog_run, 0);
+				    &ctx, bpf_prog_run, 0, NULL);
 	release_sock(sk);
 
 	if (ret)
@@ -1650,7 +1626,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 
 	lock_sock(sk);
 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
-				    &ctx, bpf_prog_run, retval);
+				    &ctx, bpf_prog_run, retval, NULL);
 	release_sock(sk);
 
 	if (ret < 0)
@@ -1699,7 +1675,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
 	 */
 
 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
-				    &ctx, bpf_prog_run, retval);
+				    &ctx, bpf_prog_run, retval, NULL);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From 61df10c7799e27807ad5e459eec9d77cddf8bf45 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:49 +0530
Subject: bpf: Allow storing unreferenced kptr in map

This commit introduces a new pointer type 'kptr' which can be embedded
in a map value to hold a PTR_TO_BTF_ID stored by a BPF program during
its invocation. When storing such a kptr, BPF program's PTR_TO_BTF_ID
register must have the same type as in the map value's BTF, and loading
a kptr marks the destination register as PTR_TO_BTF_ID with the correct
kernel BTF and BTF ID.

Such kptr are unreferenced, i.e. by the time another invocation of the
BPF program loads this pointer, the object which the pointer points to
may not longer exist. Since PTR_TO_BTF_ID loads (using BPF_LDX) are
patched to PROBE_MEM loads by the verifier, it would safe to allow user
to still access such invalid pointer, but passing such pointers into
BPF helpers and kfuncs should not be permitted. A future patch in this
series will close this gap.

The flexibility offered by allowing programs to dereference such invalid
pointers while being safe at runtime frees the verifier from doing
complex lifetime tracking. As long as the user may ensure that the
object remains valid, it can ensure data read by it from the kernel
object is valid.

The user indicates that a certain pointer must be treated as kptr
capable of accepting stores of PTR_TO_BTF_ID of a certain type, by using
a BTF type tag 'kptr' on the pointed to type of the pointer. Then, this
information is recorded in the object BTF which will be passed into the
kernel by way of map's BTF information. The name and kind from the map
value BTF is used to look up the in-kernel type, and the actual BTF and
BTF ID is recorded in the map struct in a new kptr_off_tab member. For
now, only storing pointers to structs is permitted.

An example of this specification is shown below:

	#define __kptr __attribute__((btf_type_tag("kptr")))

	struct map_value {
		...
		struct task_struct __kptr *task;
		...
	};

Then, in a BPF program, user may store PTR_TO_BTF_ID with the type
task_struct into the map, and then load it later.

Note that the destination register is marked PTR_TO_BTF_ID_OR_NULL, as
the verifier cannot know whether the value is NULL or not statically, it
must treat all potential loads at that map value offset as loading a
possibly NULL pointer.

Only BPF_LDX, BPF_STX, and BPF_ST (with insn->imm = 0 to denote NULL)
are allowed instructions that can access such a pointer. On BPF_LDX, the
destination register is updated to be a PTR_TO_BTF_ID, and on BPF_STX,
it is checked whether the source register type is a PTR_TO_BTF_ID with
same BTF type as specified in the map BTF. The access size must always
be BPF_DW.

For the map in map support, the kptr_off_tab for outer map is copied
from the inner map's kptr_off_tab. It was chosen to do a deep copy
instead of introducing a refcount to kptr_off_tab, because the copy only
needs to be done when paramterizing using inner_map_fd in the map in map
case, hence would be unnecessary for all other users.

It is not permitted to use MAP_FREEZE command and mmap for BPF map
having kptrs, similar to the bpf_timer case. A kptr also requires that
BPF program has both read and write access to the map (hence both
BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG are disallowed).

Note that check_map_access must be called from both
check_helper_mem_access and for the BPF instructions, hence the kptr
check must distinguish between ACCESS_DIRECT and ACCESS_HELPER, and
reject ACCESS_HELPER cases. We rename stack_access_src to bpf_access_src
and reuse it for this purpose.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-2-memxor@gmail.com
---
 kernel/bpf/btf.c        | 167 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/map_in_map.c |   5 +-
 kernel/bpf/syscall.c    | 103 ++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c   | 161 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 401 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0493310d981f..563ac61e6d6b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3166,9 +3166,16 @@ static void btf_struct_log(struct btf_verifier_env *env,
 enum btf_field_type {
 	BTF_FIELD_SPIN_LOCK,
 	BTF_FIELD_TIMER,
+	BTF_FIELD_KPTR,
+};
+
+enum {
+	BTF_FIELD_IGNORE = 0,
+	BTF_FIELD_FOUND  = 1,
 };
 
 struct btf_field_info {
+	u32 type_id;
 	u32 off;
 };
 
@@ -3176,29 +3183,57 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
 			   u32 off, int sz, struct btf_field_info *info)
 {
 	if (!__btf_type_is_struct(t))
-		return 0;
+		return BTF_FIELD_IGNORE;
 	if (t->size != sz)
-		return 0;
-	if (info->off != -ENOENT)
-		/* only one such field is allowed */
-		return -E2BIG;
+		return BTF_FIELD_IGNORE;
 	info->off = off;
-	return 0;
+	return BTF_FIELD_FOUND;
+}
+
+static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
+			 u32 off, int sz, struct btf_field_info *info)
+{
+	u32 res_id;
+
+	/* For PTR, sz is always == 8 */
+	if (!btf_type_is_ptr(t))
+		return BTF_FIELD_IGNORE;
+	t = btf_type_by_id(btf, t->type);
+
+	if (!btf_type_is_type_tag(t))
+		return BTF_FIELD_IGNORE;
+	/* Reject extra tags */
+	if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
+		return -EINVAL;
+	if (strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+		return -EINVAL;
+
+	/* Get the base type */
+	t = btf_type_skip_modifiers(btf, t->type, &res_id);
+	/* Only pointer to struct is allowed */
+	if (!__btf_type_is_struct(t))
+		return -EINVAL;
+
+	info->type_id = res_id;
+	info->off = off;
+	return BTF_FIELD_FOUND;
 }
 
 static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
 				 const char *name, int sz, int align,
 				 enum btf_field_type field_type,
-				 struct btf_field_info *info)
+				 struct btf_field_info *info, int info_cnt)
 {
 	const struct btf_member *member;
+	struct btf_field_info tmp;
+	int ret, idx = 0;
 	u32 i, off;
 
 	for_each_member(i, t, member) {
 		const struct btf_type *member_type = btf_type_by_id(btf,
 								    member->type);
 
-		if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
+		if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
 			continue;
 
 		off = __btf_member_bit_offset(t, member);
@@ -3212,20 +3247,38 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
 		switch (field_type) {
 		case BTF_FIELD_SPIN_LOCK:
 		case BTF_FIELD_TIMER:
-			return btf_find_struct(btf, member_type, off, sz, info);
+			ret = btf_find_struct(btf, member_type, off, sz,
+					      idx < info_cnt ? &info[idx] : &tmp);
+			if (ret < 0)
+				return ret;
+			break;
+		case BTF_FIELD_KPTR:
+			ret = btf_find_kptr(btf, member_type, off, sz,
+					    idx < info_cnt ? &info[idx] : &tmp);
+			if (ret < 0)
+				return ret;
+			break;
 		default:
 			return -EFAULT;
 		}
+
+		if (ret == BTF_FIELD_IGNORE)
+			continue;
+		if (idx >= info_cnt)
+			return -E2BIG;
+		++idx;
 	}
-	return 0;
+	return idx;
 }
 
 static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 				const char *name, int sz, int align,
 				enum btf_field_type field_type,
-				struct btf_field_info *info)
+				struct btf_field_info *info, int info_cnt)
 {
 	const struct btf_var_secinfo *vsi;
+	struct btf_field_info tmp;
+	int ret, idx = 0;
 	u32 i, off;
 
 	for_each_vsi(i, t, vsi) {
@@ -3234,7 +3287,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 
 		off = vsi->offset;
 
-		if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+		if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
 			continue;
 		if (vsi->size != sz)
 			continue;
@@ -3244,17 +3297,33 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		switch (field_type) {
 		case BTF_FIELD_SPIN_LOCK:
 		case BTF_FIELD_TIMER:
-			return btf_find_struct(btf, var_type, off, sz, info);
+			ret = btf_find_struct(btf, var_type, off, sz,
+					      idx < info_cnt ? &info[idx] : &tmp);
+			if (ret < 0)
+				return ret;
+			break;
+		case BTF_FIELD_KPTR:
+			ret = btf_find_kptr(btf, var_type, off, sz,
+					    idx < info_cnt ? &info[idx] : &tmp);
+			if (ret < 0)
+				return ret;
+			break;
 		default:
 			return -EFAULT;
 		}
+
+		if (ret == BTF_FIELD_IGNORE)
+			continue;
+		if (idx >= info_cnt)
+			return -E2BIG;
+		++idx;
 	}
-	return 0;
+	return idx;
 }
 
 static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 			  enum btf_field_type field_type,
-			  struct btf_field_info *info)
+			  struct btf_field_info *info, int info_cnt)
 {
 	const char *name;
 	int sz, align;
@@ -3270,14 +3339,19 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 		sz = sizeof(struct bpf_timer);
 		align = __alignof__(struct bpf_timer);
 		break;
+	case BTF_FIELD_KPTR:
+		name = NULL;
+		sz = sizeof(u64);
+		align = 8;
+		break;
 	default:
 		return -EFAULT;
 	}
 
 	if (__btf_type_is_struct(t))
-		return btf_find_struct_field(btf, t, name, sz, align, field_type, info);
+		return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt);
 	else if (btf_type_is_datasec(t))
-		return btf_find_datasec_var(btf, t, name, sz, align, field_type, info);
+		return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt);
 	return -EINVAL;
 }
 
@@ -3287,26 +3361,77 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
  */
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
 {
-	struct btf_field_info info = { .off = -ENOENT };
+	struct btf_field_info info;
 	int ret;
 
-	ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info);
+	ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1);
 	if (ret < 0)
 		return ret;
+	if (!ret)
+		return -ENOENT;
 	return info.off;
 }
 
 int btf_find_timer(const struct btf *btf, const struct btf_type *t)
 {
-	struct btf_field_info info = { .off = -ENOENT };
+	struct btf_field_info info;
 	int ret;
 
-	ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info);
+	ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1);
 	if (ret < 0)
 		return ret;
+	if (!ret)
+		return -ENOENT;
 	return info.off;
 }
 
+struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
+					  const struct btf_type *t)
+{
+	struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
+	struct bpf_map_value_off *tab;
+	struct btf *kernel_btf = NULL;
+	int ret, i, nr_off;
+
+	ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (!ret)
+		return NULL;
+
+	nr_off = ret;
+	tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
+	if (!tab)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < nr_off; i++) {
+		const struct btf_type *t;
+		s32 id;
+
+		/* Find type in map BTF, and use it to look up the matching type
+		 * in vmlinux or module BTFs, by name and kind.
+		 */
+		t = btf_type_by_id(btf, info_arr[i].type_id);
+		id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+				     &kernel_btf);
+		if (id < 0) {
+			ret = id;
+			goto end;
+		}
+
+		tab->off[i].offset = info_arr[i].off;
+		tab->off[i].kptr.btf_id = id;
+		tab->off[i].kptr.btf = kernel_btf;
+	}
+	tab->nr_off = nr_off;
+	return tab;
+end:
+	while (i--)
+		btf_put(tab->off[i].kptr.btf);
+	kfree(tab);
+	return ERR_PTR(ret);
+}
+
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 5cd8f5277279..135205d0d560 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	inner_map_meta->max_entries = inner_map->max_entries;
 	inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
 	inner_map_meta->timer_off = inner_map->timer_off;
+	inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
 	if (inner_map->btf) {
 		btf_get(inner_map->btf);
 		inner_map_meta->btf = inner_map->btf;
@@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 
 void bpf_map_meta_free(struct bpf_map *map_meta)
 {
+	bpf_map_free_kptr_off_tab(map_meta);
 	btf_put(map_meta->btf);
 	kfree(map_meta);
 }
@@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
 		meta0->key_size == meta1->key_size &&
 		meta0->value_size == meta1->value_size &&
 		meta0->timer_off == meta1->timer_off &&
-		meta0->map_flags == meta1->map_flags;
+		meta0->map_flags == meta1->map_flags &&
+		bpf_map_equal_kptr_off_tab(meta0, meta1);
 }
 
 void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e9e3e49c0eb7..575b09339360 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -6,6 +6,7 @@
 #include <linux/bpf_trace.h>
 #include <linux/bpf_lirc.h>
 #include <linux/bpf_verifier.h>
+#include <linux/bsearch.h>
 #include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
@@ -473,12 +474,84 @@ static void bpf_map_release_memcg(struct bpf_map *map)
 }
 #endif
 
+static int bpf_map_kptr_off_cmp(const void *a, const void *b)
+{
+	const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
+
+	if (off_desc1->offset < off_desc2->offset)
+		return -1;
+	else if (off_desc1->offset > off_desc2->offset)
+		return 1;
+	return 0;
+}
+
+struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
+{
+	/* Since members are iterated in btf_find_field in increasing order,
+	 * offsets appended to kptr_off_tab are in increasing order, so we can
+	 * do bsearch to find exact match.
+	 */
+	struct bpf_map_value_off *tab;
+
+	if (!map_value_has_kptrs(map))
+		return NULL;
+	tab = map->kptr_off_tab;
+	return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
+}
+
+void bpf_map_free_kptr_off_tab(struct bpf_map *map)
+{
+	struct bpf_map_value_off *tab = map->kptr_off_tab;
+	int i;
+
+	if (!map_value_has_kptrs(map))
+		return;
+	for (i = 0; i < tab->nr_off; i++)
+		btf_put(tab->off[i].kptr.btf);
+	kfree(tab);
+	map->kptr_off_tab = NULL;
+}
+
+struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
+{
+	struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
+	int size, i;
+
+	if (!map_value_has_kptrs(map))
+		return ERR_PTR(-ENOENT);
+	size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
+	new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
+	if (!new_tab)
+		return ERR_PTR(-ENOMEM);
+	/* Do a deep copy of the kptr_off_tab */
+	for (i = 0; i < tab->nr_off; i++)
+		btf_get(tab->off[i].kptr.btf);
+	return new_tab;
+}
+
+bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
+{
+	struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
+	bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
+	int size;
+
+	if (!a_has_kptr && !b_has_kptr)
+		return true;
+	if (a_has_kptr != b_has_kptr)
+		return false;
+	if (tab_a->nr_off != tab_b->nr_off)
+		return false;
+	size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
+	return !memcmp(tab_a, tab_b, size);
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
 	security_bpf_map_free(map);
+	bpf_map_free_kptr_off_tab(map);
 	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
@@ -640,7 +713,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 	int err;
 
 	if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
-	    map_value_has_timer(map))
+	    map_value_has_timer(map) || map_value_has_kptrs(map))
 		return -ENOTSUPP;
 
 	if (!(vma->vm_flags & VM_SHARED))
@@ -820,9 +893,33 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			return -EOPNOTSUPP;
 	}
 
-	if (map->ops->map_check_btf)
+	map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
+	if (map_value_has_kptrs(map)) {
+		if (!bpf_capable()) {
+			ret = -EPERM;
+			goto free_map_tab;
+		}
+		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+			ret = -EACCES;
+			goto free_map_tab;
+		}
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY) {
+			ret = -EOPNOTSUPP;
+			goto free_map_tab;
+		}
+	}
+
+	if (map->ops->map_check_btf) {
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
+		if (ret < 0)
+			goto free_map_tab;
+	}
 
+	return ret;
+free_map_tab:
+	bpf_map_free_kptr_off_tab(map);
 	return ret;
 }
 
@@ -1639,7 +1736,7 @@ static int map_freeze(const union bpf_attr *attr)
 		return PTR_ERR(map);
 
 	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
-	    map_value_has_timer(map)) {
+	    map_value_has_timer(map) || map_value_has_kptrs(map)) {
 		fdput(f);
 		return -ENOTSUPP;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 71827d14724a..17ca586e0585 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3211,7 +3211,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 	return 0;
 }
 
-enum stack_access_src {
+enum bpf_access_src {
 	ACCESS_DIRECT = 1,  /* the access is performed by an instruction */
 	ACCESS_HELPER = 2,  /* the access is performed by a helper */
 };
@@ -3219,7 +3219,7 @@ enum stack_access_src {
 static int check_stack_range_initialized(struct bpf_verifier_env *env,
 					 int regno, int off, int access_size,
 					 bool zero_size_allowed,
-					 enum stack_access_src type,
+					 enum bpf_access_src type,
 					 struct bpf_call_arg_meta *meta);
 
 static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -3507,9 +3507,109 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
 	return __check_ptr_off_reg(env, reg, regno, false);
 }
 
+static int map_kptr_match_type(struct bpf_verifier_env *env,
+			       struct bpf_map_value_off_desc *off_desc,
+			       struct bpf_reg_state *reg, u32 regno)
+{
+	const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
+	const char *reg_name = "";
+
+	if (base_type(reg->type) != PTR_TO_BTF_ID || type_flag(reg->type) != PTR_MAYBE_NULL)
+		goto bad_type;
+
+	if (!btf_is_kernel(reg->btf)) {
+		verbose(env, "R%d must point to kernel BTF\n", regno);
+		return -EINVAL;
+	}
+	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
+	reg_name = kernel_type_name(reg->btf, reg->btf_id);
+
+	if (__check_ptr_off_reg(env, reg, regno, true))
+		return -EACCES;
+
+	/* A full type match is needed, as BTF can be vmlinux or module BTF, and
+	 * we also need to take into account the reg->off.
+	 *
+	 * We want to support cases like:
+	 *
+	 * struct foo {
+	 *         struct bar br;
+	 *         struct baz bz;
+	 * };
+	 *
+	 * struct foo *v;
+	 * v = func();	      // PTR_TO_BTF_ID
+	 * val->foo = v;      // reg->off is zero, btf and btf_id match type
+	 * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+	 *                    // first member type of struct after comparison fails
+	 * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+	 *                    // to match type
+	 *
+	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+	 * is zero.
+	 */
+	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+				  off_desc->kptr.btf, off_desc->kptr.btf_id))
+		goto bad_type;
+	return 0;
+bad_type:
+	verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
+		reg_type_str(env, reg->type), reg_name);
+	verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+	return -EINVAL;
+}
+
+static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+				 int value_regno, int insn_idx,
+				 struct bpf_map_value_off_desc *off_desc)
+{
+	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+	int class = BPF_CLASS(insn->code);
+	struct bpf_reg_state *val_reg;
+
+	/* Things we already checked for in check_map_access and caller:
+	 *  - Reject cases where variable offset may touch kptr
+	 *  - size of access (must be BPF_DW)
+	 *  - tnum_is_const(reg->var_off)
+	 *  - off_desc->offset == off + reg->var_off.value
+	 */
+	/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
+	if (BPF_MODE(insn->code) != BPF_MEM) {
+		verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
+		return -EACCES;
+	}
+
+	if (class == BPF_LDX) {
+		val_reg = reg_state(env, value_regno);
+		/* We can simply mark the value_regno receiving the pointer
+		 * value from map as PTR_TO_BTF_ID, with the correct type.
+		 */
+		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
+				off_desc->kptr.btf_id, PTR_MAYBE_NULL);
+		/* For mark_ptr_or_null_reg */
+		val_reg->id = ++env->id_gen;
+	} else if (class == BPF_STX) {
+		val_reg = reg_state(env, value_regno);
+		if (!register_is_null(val_reg) &&
+		    map_kptr_match_type(env, off_desc, val_reg, value_regno))
+			return -EACCES;
+	} else if (class == BPF_ST) {
+		if (insn->imm) {
+			verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
+				off_desc->offset);
+			return -EACCES;
+		}
+	} else {
+		verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-			    int off, int size, bool zero_size_allowed)
+			    int off, int size, bool zero_size_allowed,
+			    enum bpf_access_src src)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
@@ -3545,6 +3645,36 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 	}
+	if (map_value_has_kptrs(map)) {
+		struct bpf_map_value_off *tab = map->kptr_off_tab;
+		int i;
+
+		for (i = 0; i < tab->nr_off; i++) {
+			u32 p = tab->off[i].offset;
+
+			if (reg->smin_value + off < p + sizeof(u64) &&
+			    p < reg->umax_value + off + size) {
+				if (src != ACCESS_DIRECT) {
+					verbose(env, "kptr cannot be accessed indirectly by helper\n");
+					return -EACCES;
+				}
+				if (!tnum_is_const(reg->var_off)) {
+					verbose(env, "kptr access cannot have variable offset\n");
+					return -EACCES;
+				}
+				if (p != off + reg->var_off.value) {
+					verbose(env, "kptr access misaligned expected=%u off=%llu\n",
+						p, off + reg->var_off.value);
+					return -EACCES;
+				}
+				if (size != bpf_size_to_bytes(BPF_DW)) {
+					verbose(env, "kptr access size must be BPF_DW\n");
+					return -EACCES;
+				}
+				break;
+			}
+		}
+	}
 	return err;
 }
 
@@ -4316,7 +4446,7 @@ static int check_stack_slot_within_bounds(int off,
 static int check_stack_access_within_bounds(
 		struct bpf_verifier_env *env,
 		int regno, int off, int access_size,
-		enum stack_access_src src, enum bpf_access_type type)
+		enum bpf_access_src src, enum bpf_access_type type)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = regs + regno;
@@ -4412,6 +4542,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (reg->type == PTR_TO_MAP_VALUE) {
+		struct bpf_map_value_off_desc *kptr_off_desc = NULL;
+
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4420,8 +4552,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_map_access_type(env, regno, off, size, t);
 		if (err)
 			return err;
-		err = check_map_access(env, regno, off, size, false);
-		if (!err && t == BPF_READ && value_regno >= 0) {
+		err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+		if (err)
+			return err;
+		if (tnum_is_const(reg->var_off))
+			kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr,
+								  off + reg->var_off.value);
+		if (kptr_off_desc) {
+			err = check_map_kptr_access(env, regno, value_regno, insn_idx,
+						    kptr_off_desc);
+		} else if (t == BPF_READ && value_regno >= 0) {
 			struct bpf_map *map = reg->map_ptr;
 
 			/* if map is read-only, track its contents as scalars */
@@ -4724,7 +4864,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 static int check_stack_range_initialized(
 		struct bpf_verifier_env *env, int regno, int off,
 		int access_size, bool zero_size_allowed,
-		enum stack_access_src type, struct bpf_call_arg_meta *meta)
+		enum bpf_access_src type, struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_func_state *state = func(env, reg);
@@ -4874,7 +5014,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 					  BPF_READ))
 			return -EACCES;
 		return check_map_access(env, regno, reg->off, access_size,
-					zero_size_allowed);
+					zero_size_allowed, ACCESS_HELPER);
 	case PTR_TO_MEM:
 		if (type_is_rdonly_mem(reg->type)) {
 			if (meta && meta->raw_mode) {
@@ -5642,7 +5782,8 @@ skip_type_check:
 		}
 
 		err = check_map_access(env, regno, reg->off,
-				       map->value_size - reg->off, false);
+				       map->value_size - reg->off, false,
+				       ACCESS_HELPER);
 		if (err)
 			return err;
 
@@ -7462,7 +7603,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
 			return -EACCES;
 		break;
 	case PTR_TO_MAP_VALUE:
-		if (check_map_access(env, dst, dst_reg->off, 1, false)) {
+		if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
 			verbose(env, "R%d pointer arithmetic of map value goes out of range, "
 				"prohibited for !root\n", dst);
 			return -EACCES;
-- 
cgit v1.2.3


From 8f14852e89113d738c99c375b4c8b8b7e1073df1 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:50 +0530
Subject: bpf: Tag argument to be released in bpf_func_proto

Add a new type flag for bpf_arg_type that when set tells verifier that
for a release function, that argument's register will be the one for
which meta.ref_obj_id will be set, and which will then be released
using release_reference. To capture the regno, introduce a new field
release_regno in bpf_call_arg_meta.

This would be required in the next patch, where we may either pass NULL
or a refcounted pointer as an argument to the release function
bpf_kptr_xchg. Just releasing only when meta.ref_obj_id is set is not
enough, as there is a case where the type of argument needed matches,
but the ref_obj_id is set to 0. Hence, we must enforce that whenever
meta.ref_obj_id is zero, the register that is to be released can only
be NULL for a release function.

Since we now indicate whether an argument is to be released in
bpf_func_proto itself, is_release_function helper has lost its utitlity,
hence refactor code to work without it, and just rely on
meta.release_regno to know when to release state for a ref_obj_id.
Still, the restriction of one release argument and only one ref_obj_id
passed to BPF helper or kfunc remains. This may be lifted in the future.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-3-memxor@gmail.com
---
 kernel/bpf/btf.c      | 11 ++++----
 kernel/bpf/ringbuf.c  |  4 +--
 kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++++++----------------------
 3 files changed, 50 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 563ac61e6d6b..f0287342204f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6047,6 +6047,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	 * verifier sees.
 	 */
 	for (i = 0; i < nargs; i++) {
+		enum bpf_arg_type arg_type = ARG_DONTCARE;
 		u32 regno = i + 1;
 		struct bpf_reg_state *reg = &regs[regno];
 
@@ -6067,7 +6068,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 
-		ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel);
+		if (rel && reg->ref_obj_id)
+			arg_type |= OBJ_RELEASE;
+		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
 		if (ret < 0)
 			return ret;
 
@@ -6099,11 +6102,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			if (reg->type == PTR_TO_BTF_ID) {
 				reg_btf = reg->btf;
 				reg_ref_id = reg->btf_id;
-				/* Ensure only one argument is referenced
-				 * PTR_TO_BTF_ID, check_func_arg_reg_off relies
-				 * on only one referenced register being allowed
-				 * for kfuncs.
-				 */
+				/* Ensure only one argument is referenced PTR_TO_BTF_ID */
 				if (reg->ref_obj_id) {
 					if (ref_obj_id) {
 						bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 710ba9de12ce..5173fd37590f 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -404,7 +404,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
 const struct bpf_func_proto bpf_ringbuf_submit_proto = {
 	.func		= bpf_ringbuf_submit,
 	.ret_type	= RET_VOID,
-	.arg1_type	= ARG_PTR_TO_ALLOC_MEM,
+	.arg1_type	= ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
 	.arg2_type	= ARG_ANYTHING,
 };
 
@@ -417,7 +417,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
 const struct bpf_func_proto bpf_ringbuf_discard_proto = {
 	.func		= bpf_ringbuf_discard,
 	.ret_type	= RET_VOID,
-	.arg1_type	= ARG_PTR_TO_ALLOC_MEM,
+	.arg1_type	= ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
 	.arg2_type	= ARG_ANYTHING,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 17ca586e0585..5426bab7f02c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -245,6 +245,7 @@ struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
 	bool raw_mode;
 	bool pkt_access;
+	u8 release_regno;
 	int regno;
 	int access_size;
 	int mem_size;
@@ -471,17 +472,6 @@ static bool type_may_be_null(u32 type)
 	return type & PTR_MAYBE_NULL;
 }
 
-/* Determine whether the function releases some resources allocated by another
- * function call. The first reference type argument will be assumed to be
- * released by release_reference().
- */
-static bool is_release_function(enum bpf_func_id func_id)
-{
-	return func_id == BPF_FUNC_sk_release ||
-	       func_id == BPF_FUNC_ringbuf_submit ||
-	       func_id == BPF_FUNC_ringbuf_discard;
-}
-
 static bool may_be_acquire_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_lookup_tcp ||
@@ -5326,6 +5316,11 @@ static bool arg_type_is_int_ptr(enum bpf_arg_type type)
 	       type == ARG_PTR_TO_LONG;
 }
 
+static bool arg_type_is_release(enum bpf_arg_type type)
+{
+	return type & OBJ_RELEASE;
+}
+
 static int int_ptr_type_to_size(enum bpf_arg_type type)
 {
 	if (type == ARG_PTR_TO_INT)
@@ -5536,11 +5531,10 @@ found:
 
 int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type,
-			   bool is_release_func)
+			   enum bpf_arg_type arg_type)
 {
-	bool fixed_off_ok = false, release_reg;
 	enum bpf_reg_type type = reg->type;
+	bool fixed_off_ok = false;
 
 	switch ((u32)type) {
 	case SCALAR_VALUE:
@@ -5558,7 +5552,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 		/* Some of the argument types nevertheless require a
 		 * zero register offset.
 		 */
-		if (arg_type != ARG_PTR_TO_ALLOC_MEM)
+		if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM)
 			return 0;
 		break;
 	/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
@@ -5566,19 +5560,17 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	 */
 	case PTR_TO_BTF_ID:
 		/* When referenced PTR_TO_BTF_ID is passed to release function,
-		 * it's fixed offset must be 0. We rely on the property that
-		 * only one referenced register can be passed to BPF helpers and
-		 * kfuncs. In the other cases, fixed offset can be non-zero.
+		 * it's fixed offset must be 0.	In the other cases, fixed offset
+		 * can be non-zero.
 		 */
-		release_reg = is_release_func && reg->ref_obj_id;
-		if (release_reg && reg->off) {
+		if (arg_type_is_release(arg_type) && reg->off) {
 			verbose(env, "R%d must have zero offset when passed to release func\n",
 				regno);
 			return -EINVAL;
 		}
-		/* For release_reg == true, fixed_off_ok must be false, but we
-		 * already checked and rejected reg->off != 0 above, so set to
-		 * true to allow fixed offset for all other cases.
+		/* For arg is release pointer, fixed_off_ok must be false, but
+		 * we already checked and rejected reg->off != 0 above, so set
+		 * to true to allow fixed offset for all other cases.
 		 */
 		fixed_off_ok = true;
 		break;
@@ -5637,14 +5629,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	if (err)
 		return err;
 
-	err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id));
+	err = check_func_arg_reg_off(env, reg, regno, arg_type);
 	if (err)
 		return err;
 
 skip_type_check:
-	/* check_func_arg_reg_off relies on only one referenced register being
-	 * allowed for BPF helpers.
-	 */
+	if (arg_type_is_release(arg_type)) {
+		if (!reg->ref_obj_id && !register_is_null(reg)) {
+			verbose(env, "R%d must be referenced when passed to release function\n",
+				regno);
+			return -EINVAL;
+		}
+		if (meta->release_regno) {
+			verbose(env, "verifier internal error: more than one release argument\n");
+			return -EFAULT;
+		}
+		meta->release_regno = regno;
+	}
+
 	if (reg->ref_obj_id) {
 		if (meta->ref_obj_id) {
 			verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -6151,7 +6153,8 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
-static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id,
+			    struct bpf_call_arg_meta *meta)
 {
 	return check_raw_mode_ok(fn) &&
 	       check_arg_pair_ok(fn) &&
@@ -6835,7 +6838,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
 
-	err = check_func_proto(fn, func_id);
+	err = check_func_proto(fn, func_id, &meta);
 	if (err) {
 		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
@@ -6868,8 +6871,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return err;
 	}
 
-	if (is_release_function(func_id)) {
-		err = release_reference(env, meta.ref_obj_id);
+	regs = cur_regs(env);
+
+	if (meta.release_regno) {
+		err = -EINVAL;
+		if (meta.ref_obj_id)
+			err = release_reference(env, meta.ref_obj_id);
+		/* meta.ref_obj_id can only be 0 if register that is meant to be
+		 * released is NULL, which must be > R0.
+		 */
+		else if (register_is_null(&regs[meta.release_regno]))
+			err = 0;
 		if (err) {
 			verbose(env, "func %s#%d reference has not been acquired before\n",
 				func_id_name(func_id), func_id);
@@ -6877,8 +6889,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 	}
 
-	regs = cur_regs(env);
-
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
 		err = check_reference_leak(env);
-- 
cgit v1.2.3


From c0a5a21c25f37c9fd7b36072f9968cdff1e4aa13 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:51 +0530
Subject: bpf: Allow storing referenced kptr in map

Extending the code in previous commits, introduce referenced kptr
support, which needs to be tagged using 'kptr_ref' tag instead. Unlike
unreferenced kptr, referenced kptr have a lot more restrictions. In
addition to the type matching, only a newly introduced bpf_kptr_xchg
helper is allowed to modify the map value at that offset. This transfers
the referenced pointer being stored into the map, releasing the
references state for the program, and returning the old value and
creating new reference state for the returned pointer.

Similar to unreferenced pointer case, return value for this case will
also be PTR_TO_BTF_ID_OR_NULL. The reference for the returned pointer
must either be eventually released by calling the corresponding release
function, otherwise it must be transferred into another map.

It is also allowed to call bpf_kptr_xchg with a NULL pointer, to clear
the value, and obtain the old value if any.

BPF_LDX, BPF_STX, and BPF_ST cannot access referenced kptr. A future
commit will permit using BPF_LDX for such pointers, but attempt at
making it safe, since the lifetime of object won't be guaranteed.

There are valid reasons to enforce the restriction of permitting only
bpf_kptr_xchg to operate on referenced kptr. The pointer value must be
consistent in face of concurrent modification, and any prior values
contained in the map must also be released before a new one is moved
into the map. To ensure proper transfer of this ownership, bpf_kptr_xchg
returns the old value, which the verifier would require the user to
either free or move into another map, and releases the reference held
for the pointer being moved in.

In the future, direct BPF_XCHG instruction may also be permitted to work
like bpf_kptr_xchg helper.

Note that process_kptr_func doesn't have to call
check_helper_mem_access, since we already disallow rdonly/wronly flags
for map, which is what check_map_access_type checks, and we already
ensure the PTR_TO_MAP_VALUE refers to kptr by obtaining its off_desc,
so check_map_access is also not required.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-4-memxor@gmail.com
---
 kernel/bpf/btf.c      | 10 +++++-
 kernel/bpf/helpers.c  | 24 +++++++++++++
 kernel/bpf/verifier.c | 98 ++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 119 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f0287342204f..4138c51728dd 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3177,6 +3177,7 @@ enum {
 struct btf_field_info {
 	u32 type_id;
 	u32 off;
+	enum bpf_kptr_type type;
 };
 
 static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
@@ -3193,6 +3194,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
 static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 			 u32 off, int sz, struct btf_field_info *info)
 {
+	enum bpf_kptr_type type;
 	u32 res_id;
 
 	/* For PTR, sz is always == 8 */
@@ -3205,7 +3207,11 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 	/* Reject extra tags */
 	if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
 		return -EINVAL;
-	if (strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+	if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+		type = BPF_KPTR_UNREF;
+	else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off)))
+		type = BPF_KPTR_REF;
+	else
 		return -EINVAL;
 
 	/* Get the base type */
@@ -3216,6 +3222,7 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 
 	info->type_id = res_id;
 	info->off = off;
+	info->type = type;
 	return BTF_FIELD_FOUND;
 }
 
@@ -3420,6 +3427,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
 		}
 
 		tab->off[i].offset = info_arr[i].off;
+		tab->off[i].type = info_arr[i].type;
 		tab->off[i].kptr.btf_id = id;
 		tab->off[i].kptr.btf = kernel_btf;
 	}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 315053ef6a75..3e709fed5306 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1374,6 +1374,28 @@ out:
 	kfree(t);
 }
 
+BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
+{
+	unsigned long *kptr = map_value;
+
+	return xchg(kptr, (unsigned long)ptr);
+}
+
+/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
+ * helper is determined dynamically by the verifier.
+ */
+#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
+
+const struct bpf_func_proto bpf_kptr_xchg_proto = {
+	.func         = bpf_kptr_xchg,
+	.gpl_only     = false,
+	.ret_type     = RET_PTR_TO_BTF_ID_OR_NULL,
+	.ret_btf_id   = BPF_PTR_POISON,
+	.arg1_type    = ARG_PTR_TO_KPTR,
+	.arg2_type    = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
+	.arg2_btf_id  = BPF_PTR_POISON,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1452,6 +1474,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_timer_start_proto;
 	case BPF_FUNC_timer_cancel:
 		return &bpf_timer_cancel_proto;
+	case BPF_FUNC_kptr_xchg:
+		return &bpf_kptr_xchg_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5426bab7f02c..c9ee44efed89 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -258,6 +258,7 @@ struct bpf_call_arg_meta {
 	struct btf *ret_btf;
 	u32 ret_btf_id;
 	u32 subprogno;
+	struct bpf_map_value_off_desc *kptr_off_desc;
 };
 
 struct btf *btf_vmlinux;
@@ -489,7 +490,8 @@ static bool is_acquire_function(enum bpf_func_id func_id,
 	if (func_id == BPF_FUNC_sk_lookup_tcp ||
 	    func_id == BPF_FUNC_sk_lookup_udp ||
 	    func_id == BPF_FUNC_skc_lookup_tcp ||
-	    func_id == BPF_FUNC_ringbuf_reserve)
+	    func_id == BPF_FUNC_ringbuf_reserve ||
+	    func_id == BPF_FUNC_kptr_xchg)
 		return true;
 
 	if (func_id == BPF_FUNC_map_lookup_elem &&
@@ -3514,6 +3516,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
 	reg_name = kernel_type_name(reg->btf, reg->btf_id);
 
+	/* For ref_ptr case, release function check should ensure we get one
+	 * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
+	 * normal store of unreferenced kptr, we must ensure var_off is zero.
+	 * Since ref_ptr cannot be accessed directly by BPF insns, checks for
+	 * reg->off and reg->ref_obj_id are not needed here.
+	 */
 	if (__check_ptr_off_reg(env, reg, regno, true))
 		return -EACCES;
 
@@ -3569,6 +3577,12 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		return -EACCES;
 	}
 
+	/* We cannot directly access kptr_ref */
+	if (off_desc->type == BPF_KPTR_REF) {
+		verbose(env, "accessing referenced kptr disallowed\n");
+		return -EACCES;
+	}
+
 	if (class == BPF_LDX) {
 		val_reg = reg_state(env, value_regno);
 		/* We can simply mark the value_regno receiving the pointer
@@ -5293,6 +5307,53 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int process_kptr_func(struct bpf_verifier_env *env, int regno,
+			     struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	struct bpf_map_value_off_desc *off_desc;
+	struct bpf_map *map_ptr = reg->map_ptr;
+	u32 kptr_off;
+	int ret;
+
+	if (!tnum_is_const(reg->var_off)) {
+		verbose(env,
+			"R%d doesn't have constant offset. kptr has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map_ptr->btf) {
+		verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+			map_ptr->name);
+		return -EINVAL;
+	}
+	if (!map_value_has_kptrs(map_ptr)) {
+		ret = PTR_ERR(map_ptr->kptr_off_tab);
+		if (ret == -E2BIG)
+			verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name,
+				BPF_MAP_VALUE_OFF_MAX);
+		else if (ret == -EEXIST)
+			verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name);
+		else
+			verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+		return -EINVAL;
+	}
+
+	meta->map_ptr = map_ptr;
+	kptr_off = reg->off + reg->var_off.value;
+	off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off);
+	if (!off_desc) {
+		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
+		return -EACCES;
+	}
+	if (off_desc->type != BPF_KPTR_REF) {
+		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
+		return -EACCES;
+	}
+	meta->kptr_off_desc = off_desc;
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return base_type(type) == ARG_PTR_TO_MEM ||
@@ -5433,6 +5494,7 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -5460,11 +5522,13 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_STACK]		= &stack_ptr_types,
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
 	[ARG_PTR_TO_TIMER]		= &timer_types,
+	[ARG_PTR_TO_KPTR]		= &kptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
-			  const u32 *arg_btf_id)
+			  const u32 *arg_btf_id,
+			  struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	enum bpf_reg_type expected, type = reg->type;
@@ -5517,8 +5581,11 @@ found:
 			arg_btf_id = compatible->btf_id;
 		}
 
-		if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
-					  btf_vmlinux, *arg_btf_id)) {
+		if (meta->func_id == BPF_FUNC_kptr_xchg) {
+			if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
+				return -EACCES;
+		} else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+						 btf_vmlinux, *arg_btf_id)) {
 			verbose(env, "R%d is of type %s but %s is expected\n",
 				regno, kernel_type_name(reg->btf, reg->btf_id),
 				kernel_type_name(btf_vmlinux, *arg_btf_id));
@@ -5625,7 +5692,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 		 */
 		goto skip_type_check;
 
-	err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]);
+	err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta);
 	if (err)
 		return err;
 
@@ -5801,6 +5868,9 @@ skip_type_check:
 			verbose(env, "string is not zero-terminated\n");
 			return -EINVAL;
 		}
+	} else if (arg_type == ARG_PTR_TO_KPTR) {
+		if (process_kptr_func(env, regno, meta))
+			return -EACCES;
 	}
 
 	return err;
@@ -6143,10 +6213,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
-		if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
+		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
 			return false;
 
-		if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+		if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
 			return false;
 	}
 
@@ -7012,21 +7082,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			regs[BPF_REG_0].btf_id = meta.ret_btf_id;
 		}
 	} else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+		struct btf *ret_btf;
 		int ret_btf_id;
 
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
-		ret_btf_id = *fn->ret_btf_id;
+		if (func_id == BPF_FUNC_kptr_xchg) {
+			ret_btf = meta.kptr_off_desc->kptr.btf;
+			ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
+		} else {
+			ret_btf = btf_vmlinux;
+			ret_btf_id = *fn->ret_btf_id;
+		}
 		if (ret_btf_id == 0) {
 			verbose(env, "invalid return type %u of func %s#%d\n",
 				base_type(ret_type), func_id_name(func_id),
 				func_id);
 			return -EINVAL;
 		}
-		/* current BPF helper definitions are only coming from
-		 * built-in code with type IDs from  vmlinux BTF
-		 */
-		regs[BPF_REG_0].btf = btf_vmlinux;
+		regs[BPF_REG_0].btf = ret_btf;
 		regs[BPF_REG_0].btf_id = ret_btf_id;
 	} else {
 		verbose(env, "unknown return type %u of func %s#%d\n",
-- 
cgit v1.2.3


From 6efe152d4061a83a2c5db6a0e5b58f9345c9742f Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:52 +0530
Subject: bpf: Prevent escaping of kptr loaded from maps

While we can guarantee that even for unreferenced kptr, the object
pointer points to being freed etc. can be handled by the verifier's
exception handling (normal load patching to PROBE_MEM loads), we still
cannot allow the user to pass these pointers to BPF helpers and kfunc,
because the same exception handling won't be done for accesses inside
the kernel. The same is true if a referenced pointer is loaded using
normal load instruction. Since the reference is not guaranteed to be
held while the pointer is used, it must be marked as untrusted.

Hence introduce a new type flag, PTR_UNTRUSTED, which is used to mark
all registers loading unreferenced and referenced kptr from BPF maps,
and ensure they can never escape the BPF program and into the kernel by
way of calling stable/unstable helpers.

In check_ptr_to_btf_access, the !type_may_be_null check to reject type
flags is still correct, as apart from PTR_MAYBE_NULL, only MEM_USER,
MEM_PERCPU, and PTR_UNTRUSTED may be set for PTR_TO_BTF_ID. The first
two are checked inside the function and rejected using a proper error
message, but we still want to allow dereference of untrusted case.

Also, we make sure to inherit PTR_UNTRUSTED when chain of pointers are
walked, so that this flag is never dropped once it has been set on a
PTR_TO_BTF_ID (i.e. trusted to untrusted transition can only be in one
direction).

In convert_ctx_accesses, extend the switch case to consider untrusted
PTR_TO_BTF_ID in addition to normal PTR_TO_BTF_ID for PROBE_MEM
conversion for BPF_LDX.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-5-memxor@gmail.com
---
 kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c9ee44efed89..955c3125576a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -567,6 +567,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 		strncpy(prefix, "user_", 32);
 	if (type & MEM_PERCPU)
 		strncpy(prefix, "percpu_", 32);
+	if (type & PTR_UNTRUSTED)
+		strncpy(prefix, "untrusted_", 32);
 
 	snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
 		 prefix, str[base_type(type)], postfix);
@@ -3504,9 +3506,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg, u32 regno)
 {
 	const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
+	int perm_flags = PTR_MAYBE_NULL;
 	const char *reg_name = "";
 
-	if (base_type(reg->type) != PTR_TO_BTF_ID || type_flag(reg->type) != PTR_MAYBE_NULL)
+	/* Only unreferenced case accepts untrusted pointers */
+	if (off_desc->type == BPF_KPTR_UNREF)
+		perm_flags |= PTR_UNTRUSTED;
+
+	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
 		goto bad_type;
 
 	if (!btf_is_kernel(reg->btf)) {
@@ -3553,7 +3560,12 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 bad_type:
 	verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
 		reg_type_str(env, reg->type), reg_name);
-	verbose(env, "expected=%s%s\n", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+	verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+	if (off_desc->type == BPF_KPTR_UNREF)
+		verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
+			targ_name);
+	else
+		verbose(env, "\n");
 	return -EINVAL;
 }
 
@@ -3577,9 +3589,11 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		return -EACCES;
 	}
 
-	/* We cannot directly access kptr_ref */
-	if (off_desc->type == BPF_KPTR_REF) {
-		verbose(env, "accessing referenced kptr disallowed\n");
+	/* We only allow loading referenced kptr, since it will be marked as
+	 * untrusted, similar to unreferenced kptr.
+	 */
+	if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) {
+		verbose(env, "store to referenced kptr disallowed\n");
 		return -EACCES;
 	}
 
@@ -3589,7 +3603,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
-				off_desc->kptr.btf_id, PTR_MAYBE_NULL);
+				off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -4358,6 +4372,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	if (ret < 0)
 		return ret;
 
+	/* If this is an untrusted pointer, all pointers formed by walking it
+	 * also inherit the untrusted flag.
+	 */
+	if (type_flag(reg->type) & PTR_UNTRUSTED)
+		flag |= PTR_UNTRUSTED;
+
 	if (atype == BPF_READ && value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
 
@@ -13076,7 +13096,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (!ctx_access)
 			continue;
 
-		switch (env->insn_aux_data[i + delta].ptr_type) {
+		switch ((int)env->insn_aux_data[i + delta].ptr_type) {
 		case PTR_TO_CTX:
 			if (!ops->convert_ctx_access)
 				continue;
@@ -13093,6 +13113,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
 			break;
 		case PTR_TO_BTF_ID:
+		case PTR_TO_BTF_ID | PTR_UNTRUSTED:
 			if (type == BPF_READ) {
 				insn->code = BPF_LDX | BPF_PROBE_MEM |
 					BPF_SIZE((insn)->code);
-- 
cgit v1.2.3


From 4d7d7f69f4b104b2ddeec6a1e7fcfd2d044ed8c4 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:53 +0530
Subject: bpf: Adapt copy_map_value for multiple offset case

Since now there might be at most 10 offsets that need handling in
copy_map_value, the manual shuffling and special case is no longer going
to work. Hence, let's generalise the copy_map_value function by using
a sorted array of offsets to skip regions that must be avoided while
copying into and out of a map value.

When the map is created, we populate the offset array in struct map,
Then, copy_map_value uses this sorted offset array is used to memcpy
while skipping timer, spin lock, and kptr. The array is allocated as
in most cases none of these special fields would be present in map
value, hence we can save on space for the common case by not embedding
the entire object inside bpf_map struct.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-6-memxor@gmail.com
---
 kernel/bpf/syscall.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 575b09339360..4d419a3effc4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -30,6 +30,7 @@
 #include <linux/pgtable.h>
 #include <linux/bpf_lsm.h>
 #include <linux/poll.h>
+#include <linux/sort.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
@@ -551,6 +552,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
 	security_bpf_map_free(map);
+	kfree(map->off_arr);
 	bpf_map_free_kptr_off_tab(map);
 	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
@@ -840,6 +842,84 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
+static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
+{
+	const u32 a = *(const u32 *)_a;
+	const u32 b = *(const u32 *)_b;
+
+	if (a < b)
+		return -1;
+	else if (a > b)
+		return 1;
+	return 0;
+}
+
+static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
+{
+	struct bpf_map *map = (struct bpf_map *)priv;
+	u32 *off_base = map->off_arr->field_off;
+	u32 *a = _a, *b = _b;
+	u8 *sz_a, *sz_b;
+
+	sz_a = map->off_arr->field_sz + (a - off_base);
+	sz_b = map->off_arr->field_sz + (b - off_base);
+
+	swap(*a, *b);
+	swap(*sz_a, *sz_b);
+}
+
+static int bpf_map_alloc_off_arr(struct bpf_map *map)
+{
+	bool has_spin_lock = map_value_has_spin_lock(map);
+	bool has_timer = map_value_has_timer(map);
+	bool has_kptrs = map_value_has_kptrs(map);
+	struct bpf_map_off_arr *off_arr;
+	u32 i;
+
+	if (!has_spin_lock && !has_timer && !has_kptrs) {
+		map->off_arr = NULL;
+		return 0;
+	}
+
+	off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
+	if (!off_arr)
+		return -ENOMEM;
+	map->off_arr = off_arr;
+
+	off_arr->cnt = 0;
+	if (has_spin_lock) {
+		i = off_arr->cnt;
+
+		off_arr->field_off[i] = map->spin_lock_off;
+		off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
+		off_arr->cnt++;
+	}
+	if (has_timer) {
+		i = off_arr->cnt;
+
+		off_arr->field_off[i] = map->timer_off;
+		off_arr->field_sz[i] = sizeof(struct bpf_timer);
+		off_arr->cnt++;
+	}
+	if (has_kptrs) {
+		struct bpf_map_value_off *tab = map->kptr_off_tab;
+		u32 *off = &off_arr->field_off[off_arr->cnt];
+		u8 *sz = &off_arr->field_sz[off_arr->cnt];
+
+		for (i = 0; i < tab->nr_off; i++) {
+			*off++ = tab->off[i].offset;
+			*sz++ = sizeof(u64);
+		}
+		off_arr->cnt += tab->nr_off;
+	}
+
+	if (off_arr->cnt == 1)
+		return 0;
+	sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
+	       map_off_arr_cmp, map_off_arr_swap, map);
+	return 0;
+}
+
 static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			 u32 btf_key_id, u32 btf_value_id)
 {
@@ -1009,10 +1089,14 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-	err = security_bpf_map_alloc(map);
+	err = bpf_map_alloc_off_arr(map);
 	if (err)
 		goto free_map;
 
+	err = security_bpf_map_alloc(map);
+	if (err)
+		goto free_map_off_arr;
+
 	err = bpf_map_alloc_id(map);
 	if (err)
 		goto free_map_sec;
@@ -1035,6 +1119,8 @@ static int map_create(union bpf_attr *attr)
 
 free_map_sec:
 	security_bpf_map_free(map);
+free_map_off_arr:
+	kfree(map->off_arr);
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
-- 
cgit v1.2.3


From 5ce937d613a423ca3102f53d9f3daf4210c1b6e2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:54 +0530
Subject: bpf: Populate pairs of btf_id and destructor kfunc in btf

To support storing referenced PTR_TO_BTF_ID in maps, we require
associating a specific BTF ID with a 'destructor' kfunc. This is because
we need to release a live referenced pointer at a certain offset in map
value from the map destruction path, otherwise we end up leaking
resources.

Hence, introduce support for passing an array of btf_id, kfunc_btf_id
pairs that denote a BTF ID and its associated release function. Then,
add an accessor 'btf_find_dtor_kfunc' which can be used to look up the
destructor kfunc of a certain BTF ID. If found, we can use it to free
the object from the map free path.

The registration of these pairs also serve as a whitelist of structures
which are allowed as referenced PTR_TO_BTF_ID in a BPF map, because
without finding the destructor kfunc, we will bail and return an error.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-7-memxor@gmail.com
---
 kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4138c51728dd..a0b1665a4a48 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -207,12 +207,18 @@ enum btf_kfunc_hook {
 
 enum {
 	BTF_KFUNC_SET_MAX_CNT = 32,
+	BTF_DTOR_KFUNC_MAX_CNT = 256,
 };
 
 struct btf_kfunc_set_tab {
 	struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
 };
 
+struct btf_id_dtor_kfunc_tab {
+	u32 cnt;
+	struct btf_id_dtor_kfunc dtors[];
+};
+
 struct btf {
 	void *data;
 	struct btf_type **types;
@@ -228,6 +234,7 @@ struct btf {
 	u32 id;
 	struct rcu_head rcu;
 	struct btf_kfunc_set_tab *kfunc_set_tab;
+	struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
 
 	/* split BTF support */
 	struct btf *base_btf;
@@ -1616,8 +1623,19 @@ free_tab:
 	btf->kfunc_set_tab = NULL;
 }
 
+static void btf_free_dtor_kfunc_tab(struct btf *btf)
+{
+	struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+
+	if (!tab)
+		return;
+	kfree(tab);
+	btf->dtor_kfunc_tab = NULL;
+}
+
 static void btf_free(struct btf *btf)
 {
+	btf_free_dtor_kfunc_tab(btf);
 	btf_free_kfunc_set_tab(btf);
 	kvfree(btf->types);
 	kvfree(btf->resolved_sizes);
@@ -7076,6 +7094,96 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
 }
 EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
 
+s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
+{
+	struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+	struct btf_id_dtor_kfunc *dtor;
+
+	if (!tab)
+		return -ENOENT;
+	/* Even though the size of tab->dtors[0] is > sizeof(u32), we only need
+	 * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func.
+	 */
+	BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0);
+	dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func);
+	if (!dtor)
+		return -ENOENT;
+	return dtor->kfunc_btf_id;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
+				struct module *owner)
+{
+	struct btf_id_dtor_kfunc_tab *tab;
+	struct btf *btf;
+	u32 tab_cnt;
+	int ret;
+
+	btf = btf_get_module_btf(owner);
+	if (!btf) {
+		if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+			pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n");
+			return -ENOENT;
+		}
+		if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+			pr_err("missing module BTF, cannot register dtor kfuncs\n");
+			return -ENOENT;
+		}
+		return 0;
+	}
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+		pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+		ret = -E2BIG;
+		goto end;
+	}
+
+	tab = btf->dtor_kfunc_tab;
+	/* Only one call allowed for modules */
+	if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
+		ret = -EINVAL;
+		goto end;
+	}
+
+	tab_cnt = tab ? tab->cnt : 0;
+	if (tab_cnt > U32_MAX - add_cnt) {
+		ret = -EOVERFLOW;
+		goto end;
+	}
+	if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+		pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+		ret = -E2BIG;
+		goto end;
+	}
+
+	tab = krealloc(btf->dtor_kfunc_tab,
+		       offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+		       GFP_KERNEL | __GFP_NOWARN);
+	if (!tab) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	if (!btf->dtor_kfunc_tab)
+		tab->cnt = 0;
+	btf->dtor_kfunc_tab = tab;
+
+	memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+	tab->cnt += add_cnt;
+
+	sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
+
+	return 0;
+end:
+	btf_free_dtor_kfunc_tab(btf);
+	btf_put(btf);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
+
 #define MAX_TYPES_ARE_COMPAT_DEPTH 2
 
 static
-- 
cgit v1.2.3


From 14a324f6a67ef6a53e04362a70160a47eb8afffa Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:55 +0530
Subject: bpf: Wire up freeing of referenced kptr

A destructor kfunc can be defined as void func(type *), where type may
be void or any other pointer type as per convenience.

In this patch, we ensure that the type is sane and capture the function
pointer into off_desc of ptr_off_tab for the specific pointer offset,
with the invariant that the dtor pointer is always set when 'kptr_ref'
tag is applied to the pointer's pointee type, which is indicated by the
flag BPF_MAP_VALUE_OFF_F_REF.

Note that only BTF IDs whose destructor kfunc is registered, thus become
the allowed BTF IDs for embedding as referenced kptr. Hence it serves
the purpose of finding dtor kfunc BTF ID, as well acting as a check
against the whitelist of allowed BTF IDs for this purpose.

Finally, wire up the actual freeing of the referenced pointer if any at
all available offsets, so that no references are leaked after the BPF
map goes away and the BPF program previously moved the ownership a
referenced pointer into it.

The behavior is similar to BPF timers, where bpf_map_{update,delete}_elem
will free any existing referenced kptr. The same case is with LRU map's
bpf_lru_push_free/htab_lru_push_free functions, which are extended to
reset unreferenced and free referenced kptr.

Note that unlike BPF timers, kptr is not reset or freed when map uref
drops to zero.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-8-memxor@gmail.com
---
 kernel/bpf/arraymap.c | 18 +++++++---
 kernel/bpf/btf.c      | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/hashtab.c  | 64 ++++++++++++++++++++++++---------
 kernel/bpf/syscall.c  | 49 +++++++++++++++++++++++---
 4 files changed, 204 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7f145aefbff8..c3de63ce574e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -287,10 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
 	return 0;
 }
 
-static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+static void check_and_free_fields(struct bpf_array *arr, void *val)
 {
-	if (unlikely(map_value_has_timer(&arr->map)))
+	if (map_value_has_timer(&arr->map))
 		bpf_timer_cancel_and_free(val + arr->map.timer_off);
+	if (map_value_has_kptrs(&arr->map))
+		bpf_map_free_kptrs(&arr->map, val);
 }
 
 /* Called from syscall or from eBPF program */
@@ -327,7 +329,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 			copy_map_value_locked(map, val, value, false);
 		else
 			copy_map_value(map, val, value);
-		check_and_free_timer_in_array(array, val);
+		check_and_free_fields(array, val);
 	}
 	return 0;
 }
@@ -386,7 +388,8 @@ static void array_map_free_timers(struct bpf_map *map)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
-	if (likely(!map_value_has_timer(map)))
+	/* We don't reset or free kptr on uref dropping to zero. */
+	if (!map_value_has_timer(map))
 		return;
 
 	for (i = 0; i < array->map.max_entries; i++)
@@ -398,6 +401,13 @@ static void array_map_free_timers(struct bpf_map *map)
 static void array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	if (map_value_has_kptrs(map)) {
+		for (i = 0; i < array->map.max_entries; i++)
+			bpf_map_free_kptrs(map, array->value + array->elem_size * i);
+		bpf_map_free_kptr_off_tab(map);
+	}
 
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a0b1665a4a48..1f2012fd89fb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3416,6 +3416,7 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
 	struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
 	struct bpf_map_value_off *tab;
 	struct btf *kernel_btf = NULL;
+	struct module *mod = NULL;
 	int ret, i, nr_off;
 
 	ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
@@ -3444,16 +3445,69 @@ struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
 			goto end;
 		}
 
+		/* Find and stash the function pointer for the destruction function that
+		 * needs to be eventually invoked from the map free path.
+		 */
+		if (info_arr[i].type == BPF_KPTR_REF) {
+			const struct btf_type *dtor_func;
+			const char *dtor_func_name;
+			unsigned long addr;
+			s32 dtor_btf_id;
+
+			/* This call also serves as a whitelist of allowed objects that
+			 * can be used as a referenced pointer and be stored in a map at
+			 * the same time.
+			 */
+			dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+			if (dtor_btf_id < 0) {
+				ret = dtor_btf_id;
+				goto end_btf;
+			}
+
+			dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+			if (!dtor_func) {
+				ret = -ENOENT;
+				goto end_btf;
+			}
+
+			if (btf_is_module(kernel_btf)) {
+				mod = btf_try_get_module(kernel_btf);
+				if (!mod) {
+					ret = -ENXIO;
+					goto end_btf;
+				}
+			}
+
+			/* We already verified dtor_func to be btf_type_is_func
+			 * in register_btf_id_dtor_kfuncs.
+			 */
+			dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+			addr = kallsyms_lookup_name(dtor_func_name);
+			if (!addr) {
+				ret = -EINVAL;
+				goto end_mod;
+			}
+			tab->off[i].kptr.dtor = (void *)addr;
+		}
+
 		tab->off[i].offset = info_arr[i].off;
 		tab->off[i].type = info_arr[i].type;
 		tab->off[i].kptr.btf_id = id;
 		tab->off[i].kptr.btf = kernel_btf;
+		tab->off[i].kptr.module = mod;
 	}
 	tab->nr_off = nr_off;
 	return tab;
+end_mod:
+	module_put(mod);
+end_btf:
+	btf_put(kernel_btf);
 end:
-	while (i--)
+	while (i--) {
 		btf_put(tab->off[i].kptr.btf);
+		if (tab->off[i].kptr.module)
+			module_put(tab->off[i].kptr.module);
+	}
 	kfree(tab);
 	return ERR_PTR(ret);
 }
@@ -7111,6 +7165,43 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
 	return dtor->kfunc_btf_id;
 }
 
+static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt)
+{
+	const struct btf_type *dtor_func, *dtor_func_proto, *t;
+	const struct btf_param *args;
+	s32 dtor_btf_id;
+	u32 nr_args, i;
+
+	for (i = 0; i < cnt; i++) {
+		dtor_btf_id = dtors[i].kfunc_btf_id;
+
+		dtor_func = btf_type_by_id(btf, dtor_btf_id);
+		if (!dtor_func || !btf_type_is_func(dtor_func))
+			return -EINVAL;
+
+		dtor_func_proto = btf_type_by_id(btf, dtor_func->type);
+		if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto))
+			return -EINVAL;
+
+		/* Make sure the prototype of the destructor kfunc is 'void func(type *)' */
+		t = btf_type_by_id(btf, dtor_func_proto->type);
+		if (!t || !btf_type_is_void(t))
+			return -EINVAL;
+
+		nr_args = btf_type_vlen(dtor_func_proto);
+		if (nr_args != 1)
+			return -EINVAL;
+		args = btf_params(dtor_func_proto);
+		t = btf_type_by_id(btf, args[0].type);
+		/* Allow any pointer type, as width on targets Linux supports
+		 * will be same for all pointer types (i.e. sizeof(void *))
+		 */
+		if (!t || !btf_type_is_ptr(t))
+			return -EINVAL;
+	}
+	return 0;
+}
+
 /* This function must be invoked only from initcalls/module init functions */
 int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
 				struct module *owner)
@@ -7141,6 +7232,11 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
 		goto end;
 	}
 
+	/* Ensure that the prototype of dtor kfuncs being registered is sane */
+	ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt);
+	if (ret < 0)
+		goto end;
+
 	tab = btf->dtor_kfunc_tab;
 	/* Only one call allowed for modules */
 	if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c68fbebc8c00..7351e61bd683 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -238,7 +238,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
 	u32 num_entries = htab->map.max_entries;
 	int i;
 
-	if (likely(!map_value_has_timer(&htab->map)))
+	if (!map_value_has_timer(&htab->map))
 		return;
 	if (htab_has_extra_elems(htab))
 		num_entries += num_possible_cpus();
@@ -254,6 +254,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
 	}
 }
 
+static void htab_free_prealloced_kptrs(struct bpf_htab *htab)
+{
+	u32 num_entries = htab->map.max_entries;
+	int i;
+
+	if (!map_value_has_kptrs(&htab->map))
+		return;
+	if (htab_has_extra_elems(htab))
+		num_entries += num_possible_cpus();
+
+	for (i = 0; i < num_entries; i++) {
+		struct htab_elem *elem;
+
+		elem = get_htab_elem(htab, i);
+		bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8));
+		cond_resched();
+	}
+}
+
 static void htab_free_elems(struct bpf_htab *htab)
 {
 	int i;
@@ -725,12 +744,15 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
 	return insn - insn_buf;
 }
 
-static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+static void check_and_free_fields(struct bpf_htab *htab,
+				  struct htab_elem *elem)
 {
-	if (unlikely(map_value_has_timer(&htab->map)))
-		bpf_timer_cancel_and_free(elem->key +
-					  round_up(htab->map.key_size, 8) +
-					  htab->map.timer_off);
+	void *map_value = elem->key + round_up(htab->map.key_size, 8);
+
+	if (map_value_has_timer(&htab->map))
+		bpf_timer_cancel_and_free(map_value + htab->map.timer_off);
+	if (map_value_has_kptrs(&htab->map))
+		bpf_map_free_kptrs(&htab->map, map_value);
 }
 
 /* It is called from the bpf_lru_list when the LRU needs to delete
@@ -757,7 +779,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			check_and_free_timer(htab, l);
+			check_and_free_fields(htab, l);
 			break;
 		}
 
@@ -829,7 +851,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
-	check_and_free_timer(htab, l);
+	check_and_free_fields(htab, l);
 	kfree(l);
 }
 
@@ -857,7 +879,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	htab_put_fd_value(htab, l);
 
 	if (htab_is_prealloc(htab)) {
-		check_and_free_timer(htab, l);
+		check_and_free_fields(htab, l);
 		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -1104,7 +1126,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		if (!htab_is_prealloc(htab))
 			free_htab_elem(htab, l_old);
 		else
-			check_and_free_timer(htab, l_old);
+			check_and_free_fields(htab, l_old);
 	}
 	ret = 0;
 err:
@@ -1114,7 +1136,7 @@ err:
 
 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 {
-	check_and_free_timer(htab, elem);
+	check_and_free_fields(htab, elem);
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
 
@@ -1419,8 +1441,14 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
 		struct hlist_nulls_node *n;
 		struct htab_elem *l;
 
-		hlist_nulls_for_each_entry(l, n, head, hash_node)
-			check_and_free_timer(htab, l);
+		hlist_nulls_for_each_entry(l, n, head, hash_node) {
+			/* We don't reset or free kptr on uref dropping to zero,
+			 * hence just free timer.
+			 */
+			bpf_timer_cancel_and_free(l->key +
+						  round_up(htab->map.key_size, 8) +
+						  htab->map.timer_off);
+		}
 		cond_resched_rcu();
 	}
 	rcu_read_unlock();
@@ -1430,7 +1458,8 @@ static void htab_map_free_timers(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 
-	if (likely(!map_value_has_timer(&htab->map)))
+	/* We don't reset or free kptr on uref dropping to zero. */
+	if (!map_value_has_timer(&htab->map))
 		return;
 	if (!htab_is_prealloc(htab))
 		htab_free_malloced_timers(htab);
@@ -1453,11 +1482,14 @@ static void htab_map_free(struct bpf_map *map)
 	 * not have executed. Wait for them.
 	 */
 	rcu_barrier();
-	if (!htab_is_prealloc(htab))
+	if (!htab_is_prealloc(htab)) {
 		delete_all_elements(htab);
-	else
+	} else {
+		htab_free_prealloced_kptrs(htab);
 		prealloc_destroy(htab);
+	}
 
+	bpf_map_free_kptr_off_tab(map);
 	free_percpu(htab->extra_elems);
 	bpf_map_area_free(htab->buckets);
 	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4d419a3effc4..e0aead17dff4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -507,8 +507,11 @@ void bpf_map_free_kptr_off_tab(struct bpf_map *map)
 
 	if (!map_value_has_kptrs(map))
 		return;
-	for (i = 0; i < tab->nr_off; i++)
+	for (i = 0; i < tab->nr_off; i++) {
+		if (tab->off[i].kptr.module)
+			module_put(tab->off[i].kptr.module);
 		btf_put(tab->off[i].kptr.btf);
+	}
 	kfree(tab);
 	map->kptr_off_tab = NULL;
 }
@@ -525,8 +528,18 @@ struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
 	if (!new_tab)
 		return ERR_PTR(-ENOMEM);
 	/* Do a deep copy of the kptr_off_tab */
-	for (i = 0; i < tab->nr_off; i++)
+	for (i = 0; i < tab->nr_off; i++) {
 		btf_get(tab->off[i].kptr.btf);
+		if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) {
+			while (i--) {
+				if (tab->off[i].kptr.module)
+					module_put(tab->off[i].kptr.module);
+				btf_put(tab->off[i].kptr.btf);
+			}
+			kfree(new_tab);
+			return ERR_PTR(-ENXIO);
+		}
+	}
 	return new_tab;
 }
 
@@ -546,6 +559,33 @@ bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_ma
 	return !memcmp(tab_a, tab_b, size);
 }
 
+/* Caller must ensure map_value_has_kptrs is true. Note that this function can
+ * be called on a map value while the map_value is visible to BPF programs, as
+ * it ensures the correct synchronization, and we already enforce the same using
+ * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs.
+ */
+void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
+{
+	struct bpf_map_value_off *tab = map->kptr_off_tab;
+	unsigned long *btf_id_ptr;
+	int i;
+
+	for (i = 0; i < tab->nr_off; i++) {
+		struct bpf_map_value_off_desc *off_desc = &tab->off[i];
+		unsigned long old_ptr;
+
+		btf_id_ptr = map_value + off_desc->offset;
+		if (off_desc->type == BPF_KPTR_UNREF) {
+			u64 *p = (u64 *)btf_id_ptr;
+
+			WRITE_ONCE(p, 0);
+			continue;
+		}
+		old_ptr = xchg(btf_id_ptr, 0);
+		off_desc->kptr.dtor((void *)old_ptr);
+	}
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -553,9 +593,10 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	security_bpf_map_free(map);
 	kfree(map->off_arr);
-	bpf_map_free_kptr_off_tab(map);
 	bpf_map_release_memcg(map);
-	/* implementation dependent freeing */
+	/* implementation dependent freeing, map_free callback also does
+	 * bpf_map_free_kptr_off_tab, if needed.
+	 */
 	map->ops->map_free(map);
 }
 
-- 
cgit v1.2.3


From a1ef195996526da45bbc9710849254023df75aea Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:56 +0530
Subject: bpf: Teach verifier about kptr_get kfunc helpers

We introduce a new style of kfunc helpers, namely *_kptr_get, where they
take pointer to the map value which points to a referenced kernel
pointer contained in the map. Since this is referenced, only
bpf_kptr_xchg from BPF side and xchg from kernel side is allowed to
change the current value, and each pointer that resides in that location
would be referenced, and RCU protected (this must be kept in mind while
adding kernel types embeddable as reference kptr in BPF maps).

This means that if do the load of the pointer value in an RCU read
section, and find a live pointer, then as long as we hold RCU read lock,
it won't be freed by a parallel xchg + release operation. This allows us
to implement a safe refcount increment scheme. Hence, enforce that first
argument of all such kfunc is a proper PTR_TO_MAP_VALUE pointing at the
right offset to referenced pointer.

For the rest of the arguments, they are subjected to typical kfunc
argument checks, hence allowing some flexibility in passing more intent
into how the reference should be taken.

For instance, in case of struct nf_conn, it is not freed until RCU grace
period ends, but can still be reused for another tuple once refcount has
dropped to zero. Hence, a bpf_ct_kptr_get helper not only needs to call
refcount_inc_not_zero, but also do a tuple match after incrementing the
reference, and when it fails to match it, put the reference again and
return NULL.

This can be implemented easily if we allow passing additional parameters
to the bpf_ct_kptr_get kfunc, like a struct bpf_sock_tuple * and a
tuple__sz pair.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-9-memxor@gmail.com
---
 kernel/bpf/btf.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1f2012fd89fb..494437fb40b7 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6089,11 +6089,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
+	bool rel = false, kptr_get = false;
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
 	int ref_regno = 0, ret;
-	bool rel = false;
 
 	t = btf_type_by_id(btf, func_id);
 	if (!t || !btf_type_is_func(t)) {
@@ -6119,10 +6119,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* Only kfunc can be release func */
-	if (is_kfunc)
+	if (is_kfunc) {
+		/* Only kfunc can be release func */
 		rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
 						BTF_KFUNC_TYPE_RELEASE, func_id);
+		kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+						     BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id);
+	}
+
 	/* check that BTF function arguments match actual types that the
 	 * verifier sees.
 	 */
@@ -6154,8 +6158,52 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		if (ret < 0)
 			return ret;
 
-		if (btf_get_prog_ctx_type(log, btf, t,
-					  env->prog->type, i)) {
+		/* kptr_get is only true for kfunc */
+		if (i == 0 && kptr_get) {
+			struct bpf_map_value_off_desc *off_desc;
+
+			if (reg->type != PTR_TO_MAP_VALUE) {
+				bpf_log(log, "arg#0 expected pointer to map value\n");
+				return -EINVAL;
+			}
+
+			/* check_func_arg_reg_off allows var_off for
+			 * PTR_TO_MAP_VALUE, but we need fixed offset to find
+			 * off_desc.
+			 */
+			if (!tnum_is_const(reg->var_off)) {
+				bpf_log(log, "arg#0 must have constant offset\n");
+				return -EINVAL;
+			}
+
+			off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value);
+			if (!off_desc || off_desc->type != BPF_KPTR_REF) {
+				bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n",
+					reg->off + reg->var_off.value);
+				return -EINVAL;
+			}
+
+			if (!btf_type_is_ptr(ref_t)) {
+				bpf_log(log, "arg#0 BTF type must be a double pointer\n");
+				return -EINVAL;
+			}
+
+			ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id);
+			ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+			if (!btf_type_is_struct(ref_t)) {
+				bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+					func_name, i, btf_type_str(ref_t), ref_tname);
+				return -EINVAL;
+			}
+			if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf,
+						  off_desc->kptr.btf_id)) {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n",
+					func_name, i, btf_type_str(ref_t), ref_tname);
+				return -EINVAL;
+			}
+			/* rest of the arguments can be anything, like normal kfunc */
+		} else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
-- 
cgit v1.2.3


From 2ab3b3808eb17f729edfd69e061667ca0a427195 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 25 Apr 2022 03:18:57 +0530
Subject: bpf: Make BTF type match stricter for release arguments

The current of behavior of btf_struct_ids_match for release arguments is
that when type match fails, it retries with first member type again
(recursively). Since the offset is already 0, this is akin to just
casting the pointer in normal C, since if type matches it was just
embedded inside parent sturct as an object. However, we want to reject
cases for release function type matching, be it kfunc or BPF helpers.

An example is the following:

struct foo {
	struct bar b;
};

struct foo *v = acq_foo();
rel_bar(&v->b); // btf_struct_ids_match fails btf_types_are_same, then
		// retries with first member type and succeeds, while
		// it should fail.

Hence, don't walk the struct and only rely on btf_types_are_same for
strict mode. All users of strict mode must be dealing with zero offset
anyway, since otherwise they would want the struct to be walked.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20220424214901.2743946-10-memxor@gmail.com
---
 kernel/bpf/btf.c      | 14 ++++++++++----
 kernel/bpf/verifier.c | 18 +++++++++++++++---
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 494437fb40b7..4cfaf5eebecd 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5746,7 +5746,8 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1,
 
 bool btf_struct_ids_match(struct bpf_verifier_log *log,
 			  const struct btf *btf, u32 id, int off,
-			  const struct btf *need_btf, u32 need_type_id)
+			  const struct btf *need_btf, u32 need_type_id,
+			  bool strict)
 {
 	const struct btf_type *type;
 	enum bpf_type_flag flag;
@@ -5755,7 +5756,12 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
 	/* Are we already done? */
 	if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
 		return true;
-
+	/* In case of strict type match, we do not walk struct, the top level
+	 * type match must succeed. When strict is true, off should have already
+	 * been 0.
+	 */
+	if (strict)
+		return false;
 again:
 	type = btf_type_by_id(btf, id);
 	if (!type)
@@ -6197,7 +6203,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				return -EINVAL;
 			}
 			if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf,
-						  off_desc->kptr.btf_id)) {
+						  off_desc->kptr.btf_id, true)) {
 				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n",
 					func_name, i, btf_type_str(ref_t), ref_tname);
 				return -EINVAL;
@@ -6250,7 +6256,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			reg_ref_tname = btf_name_by_offset(reg_btf,
 							   reg_ref_t->name_off);
 			if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
-						  reg->off, btf, ref_id)) {
+						  reg->off, btf, ref_id, rel && reg->ref_obj_id)) {
 				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
 					func_name, i,
 					btf_type_str(ref_t), ref_tname,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 955c3125576a..813f6ee80419 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3551,10 +3551,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 	 *                    // to match type
 	 *
 	 * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
-	 * is zero.
+	 * is zero. We must also ensure that btf_struct_ids_match does not walk
+	 * the struct to match type against first member of struct, i.e. reject
+	 * second case from above. Hence, when type is BPF_KPTR_REF, we set
+	 * strict mode to true for type match.
 	 */
 	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
-				  off_desc->kptr.btf, off_desc->kptr.btf_id))
+				  off_desc->kptr.btf, off_desc->kptr.btf_id,
+				  off_desc->type == BPF_KPTR_REF))
 		goto bad_type;
 	return 0;
 bad_type:
@@ -5593,6 +5597,13 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 
 found:
 	if (reg->type == PTR_TO_BTF_ID) {
+		/* For bpf_sk_release, it needs to match against first member
+		 * 'struct sock_common', hence make an exception for it. This
+		 * allows bpf_sk_release to work for multiple socket types.
+		 */
+		bool strict_type_match = arg_type_is_release(arg_type) &&
+					 meta->func_id != BPF_FUNC_sk_release;
+
 		if (!arg_btf_id) {
 			if (!compatible->btf_id) {
 				verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -5605,7 +5616,8 @@ found:
 			if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
 				return -EACCES;
 		} else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
-						 btf_vmlinux, *arg_btf_id)) {
+						 btf_vmlinux, *arg_btf_id,
+						 strict_type_match)) {
 			verbose(env, "R%d is of type %s but %s is expected\n",
 				regno, kernel_type_name(reg->btf, reg->btf_id),
 				kernel_type_name(btf_vmlinux, *arg_btf_id));
-- 
cgit v1.2.3


From 8e274732115f63c1d09136284431b3555bd5cc56 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 25 Apr 2022 23:04:28 +0206
Subject: printk: extend console_lock for per-console locking

Currently threaded console printers synchronize against each
other using console_lock(). However, different console drivers
are unrelated and do not require any synchronization between
each other. Removing the synchronization between the threaded
console printers will allow each console to print at its own
speed.

But the threaded consoles printers do still need to synchronize
against console_lock() callers. Introduce a per-console mutex
and a new console boolean field @blocked to provide this
synchronization.

console_lock() is modified so that it must acquire the mutex
of each console in order to set the @blocked field. Console
printing threads will acquire their mutex while printing a
record. If @blocked was set, the thread will go back to sleep
instead of printing.

The reason for the @blocked boolean field is so that
console_lock() callers do not need to acquire multiple console
mutexes simultaneously, which would introduce unnecessary
complexity due to nested mutex locking. Also, a new field
was chosen instead of adding a new @flags value so that the
blocked status could be checked without concern of reading
inconsistent values due to @flags updates from other contexts.

Threaded console printers also need to synchronize against
console_trylock() callers. Since console_trylock() may be
called from any context, the per-console mutex cannot be used
for this synchronization. (mutex_trylock() cannot be called
from atomic contexts.) Introduce a global atomic counter to
identify if any threaded printers are active. The threaded
printers will also check the atomic counter to identify if the
console has been locked by another task via console_trylock().

Note that @console_sem is still used to provide synchronization
between console_lock() and console_trylock() callers.

A locking overview for console_lock(), console_trylock(), and the
threaded printers is as follows (pseudo code):

console_lock()
{
        down(&console_sem);
        for_each_console(con) {
                mutex_lock(&con->lock);
                con->blocked = true;
                mutex_unlock(&con->lock);
        }
        /* console_lock acquired */
}

console_trylock()
{
        if (down_trylock(&console_sem) == 0) {
                if (atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0) {
                        /* console_lock acquired */
                }
        }
}

threaded_printer()
{
        mutex_lock(&con->lock);
        if (!con->blocked) {
		/* console_lock() callers blocked */

                if (atomic_inc_unless_negative(&console_kthreads_active)) {
                        /* console_trylock() callers blocked */

                        con->write();

                        atomic_dec(&console_lock_count);
                }
        }
        mutex_unlock(&con->lock);
}

The console owner and waiter logic now only applies between contexts
that have taken the console_lock via console_trylock(). Threaded
printers never take the console_lock, so they do not have a
console_lock to handover. Tasks that have used console_lock() will
block the threaded printers using a mutex and if the console_lock
is handed over to an atomic context, it would be unable to unblock
the threaded printers. However, the console_trylock() case is
really the only scenario that is interesting for handovers anyway.

@panic_console_dropped must change to atomic_t since it is no longer
protected exclusively by the console_lock.

Since threaded printers remain asleep if they see that the console
is locked, they now must be explicitly woken in __console_unlock().
This means wake_up_klogd() calls following a console_unlock() are
no longer necessary and are removed.

Also note that threaded printers no longer need to check
@console_suspended. The check for the @blocked field implicitly
covers the suspended console case.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/878rrs6ft7.fsf@jogness.linutronix.de
---
 kernel/printk/printk.c | 261 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 205 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e4cdc424c826..750d1229cc11 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -223,6 +223,33 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 /* Number of registered extended console drivers. */
 static int nr_ext_console_drivers;
 
+/*
+ * Used to synchronize printing kthreads against direct printing via
+ * console_trylock/console_unlock.
+ *
+ * Values:
+ * -1 = console kthreads atomically blocked (via global trylock)
+ *  0 = no kthread printing, console not locked (via trylock)
+ * >0 = kthread(s) actively printing
+ *
+ * Note: For synchronizing against direct printing via
+ *       console_lock/console_unlock, see the @lock variable in
+ *       struct console.
+ */
+static atomic_t console_kthreads_active = ATOMIC_INIT(0);
+
+#define console_kthreads_atomic_tryblock() \
+	(atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0)
+#define console_kthreads_atomic_unblock() \
+	atomic_cmpxchg(&console_kthreads_active, -1, 0)
+#define console_kthreads_atomically_blocked() \
+	(atomic_read(&console_kthreads_active) == -1)
+
+#define console_kthread_printing_tryenter() \
+	atomic_inc_unless_negative(&console_kthreads_active)
+#define console_kthread_printing_exit() \
+	atomic_dec(&console_kthreads_active)
+
 /*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
@@ -270,6 +297,49 @@ static bool panic_in_progress(void)
 	return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
 }
 
+/*
+ * Tracks whether kthread printers are all blocked. A value of true implies
+ * that the console is locked via console_lock() or the console is suspended.
+ * Writing to this variable requires holding @console_sem.
+ */
+static bool console_kthreads_blocked;
+
+/*
+ * Block all kthread printers from a schedulable context.
+ *
+ * Requires holding @console_sem.
+ */
+static void console_kthreads_block(void)
+{
+	struct console *con;
+
+	for_each_console(con) {
+		mutex_lock(&con->lock);
+		con->blocked = true;
+		mutex_unlock(&con->lock);
+	}
+
+	console_kthreads_blocked = true;
+}
+
+/*
+ * Unblock all kthread printers from a schedulable context.
+ *
+ * Requires holding @console_sem.
+ */
+static void console_kthreads_unblock(void)
+{
+	struct console *con;
+
+	for_each_console(con) {
+		mutex_lock(&con->lock);
+		con->blocked = false;
+		mutex_unlock(&con->lock);
+	}
+
+	console_kthreads_blocked = false;
+}
+
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
@@ -2603,13 +2673,6 @@ void resume_console(void)
 	down_console_sem();
 	console_suspended = 0;
 	console_unlock();
-
-	/*
-	 * While suspended, new records may have been added to the
-	 * ringbuffer. Wake up the kthread printers to print them.
-	 */
-	wake_up_klogd();
-
 	pr_flush(1000, true);
 }
 
@@ -2628,9 +2691,14 @@ static int console_cpu_notify(unsigned int cpu)
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
-
-		/* Wake kthread printers. Some may have become usable. */
-		wake_up_klogd();
+		else {
+			/*
+			 * If a new CPU comes online, the conditions for
+			 * printer_should_wake() may have changed for some
+			 * kthread printer with !CON_ANYTIME.
+			 */
+			wake_up_klogd();
+		}
 	}
 	return 0;
 }
@@ -2650,6 +2718,7 @@ void console_lock(void)
 	down_console_sem();
 	if (console_suspended)
 		return;
+	console_kthreads_block();
 	console_locked = 1;
 	console_may_schedule = 1;
 }
@@ -2671,6 +2740,10 @@ int console_trylock(void)
 		up_console_sem();
 		return 0;
 	}
+	if (!console_kthreads_atomic_tryblock()) {
+		up_console_sem();
+		return 0;
+	}
 	console_locked = 1;
 	console_may_schedule = 0;
 	return 1;
@@ -2679,7 +2752,7 @@ EXPORT_SYMBOL(console_trylock);
 
 int is_console_locked(void)
 {
-	return console_locked;
+	return (console_locked || atomic_read(&console_kthreads_active));
 }
 EXPORT_SYMBOL(is_console_locked);
 
@@ -2723,7 +2796,7 @@ static inline bool __console_is_usable(short flags)
  * Check if the given console is currently capable and allowed to print
  * records.
  *
- * Requires the console_lock.
+ * Requires holding the console_lock.
  */
 static inline bool console_is_usable(struct console *con)
 {
@@ -2736,6 +2809,22 @@ static inline bool console_is_usable(struct console *con)
 static void __console_unlock(void)
 {
 	console_locked = 0;
+
+	/*
+	 * Depending on whether console_lock() or console_trylock() was used,
+	 * appropriately allow the kthread printers to continue.
+	 */
+	if (console_kthreads_blocked)
+		console_kthreads_unblock();
+	else
+		console_kthreads_atomic_unblock();
+
+	/*
+	 * New records may have arrived while the console was locked.
+	 * Wake the kthread printers to print them.
+	 */
+	wake_up_klogd();
+
 	up_console_sem();
 }
 
@@ -2753,17 +2842,19 @@ static void __console_unlock(void)
  *
  * @handover will be set to true if a printk waiter has taken over the
  * console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false.
+ * console_lock. Otherwise it is set to false. A NULL pointer may be provided
+ * to disable allowing the console_lock to be taken over by a printk waiter.
  *
  * Returns false if the given console has no next record to print, otherwise
  * true.
  *
- * Requires the console_lock.
+ * Requires the console_lock if @handover is non-NULL.
+ * Requires con->lock otherwise.
  */
-static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
-				     char *dropped_text, bool *handover)
+static bool __console_emit_next_record(struct console *con, char *text, char *ext_text,
+				       char *dropped_text, bool *handover)
 {
-	static int panic_console_dropped;
+	static atomic_t panic_console_dropped = ATOMIC_INIT(0);
 	struct printk_info info;
 	struct printk_record r;
 	unsigned long flags;
@@ -2772,7 +2863,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
 
 	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
 
-	*handover = false;
+	if (handover)
+		*handover = false;
 
 	if (!prb_read_valid(prb, con->seq, &r))
 		return false;
@@ -2780,7 +2872,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
 	if (con->seq != r.info->seq) {
 		con->dropped += r.info->seq - con->seq;
 		con->seq = r.info->seq;
-		if (panic_in_progress() && panic_console_dropped++ > 10) {
+		if (panic_in_progress() &&
+		    atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) {
 			suppress_panic_printk = 1;
 			pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
 		}
@@ -2802,31 +2895,61 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
 		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
 	}
 
-	/*
-	 * While actively printing out messages, if another printk()
-	 * were to occur on another CPU, it may wait for this one to
-	 * finish. This task can not be preempted if there is a
-	 * waiter waiting to take over.
-	 *
-	 * Interrupts are disabled because the hand over to a waiter
-	 * must not be interrupted until the hand over is completed
-	 * (@console_waiter is cleared).
-	 */
-	printk_safe_enter_irqsave(flags);
-	console_lock_spinning_enable();
+	if (handover) {
+		/*
+		 * While actively printing out messages, if another printk()
+		 * were to occur on another CPU, it may wait for this one to
+		 * finish. This task can not be preempted if there is a
+		 * waiter waiting to take over.
+		 *
+		 * Interrupts are disabled because the hand over to a waiter
+		 * must not be interrupted until the hand over is completed
+		 * (@console_waiter is cleared).
+		 */
+		printk_safe_enter_irqsave(flags);
+		console_lock_spinning_enable();
+
+		/* don't trace irqsoff print latency */
+		stop_critical_timings();
+	}
 
-	stop_critical_timings();	/* don't trace print latency */
 	call_console_driver(con, write_text, len, dropped_text);
-	start_critical_timings();
 
 	con->seq++;
 
-	*handover = console_lock_spinning_disable_and_check();
-	printk_safe_exit_irqrestore(flags);
+	if (handover) {
+		start_critical_timings();
+		*handover = console_lock_spinning_disable_and_check();
+		printk_safe_exit_irqrestore(flags);
+	}
 skip:
 	return true;
 }
 
+/*
+ * Print a record for a given console, but allow another printk() caller to
+ * take over the console_lock and continue printing.
+ *
+ * Requires the console_lock, but depending on @handover after the call, the
+ * caller may no longer have the console_lock.
+ *
+ * See __console_emit_next_record() for argument and return details.
+ */
+static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text,
+						  char *dropped_text, bool *handover)
+{
+	/*
+	 * Handovers are only supported if threaded printers are atomically
+	 * blocked. The context taking over the console_lock may be atomic.
+	 */
+	if (!console_kthreads_atomically_blocked()) {
+		*handover = false;
+		handover = NULL;
+	}
+
+	return __console_emit_next_record(con, text, ext_text, dropped_text, handover);
+}
+
 /*
  * Print out all remaining records to all consoles.
  *
@@ -2878,13 +3001,11 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 
 			if (con->flags & CON_EXTENDED) {
 				/* Extended consoles do not print "dropped messages". */
-				progress = console_emit_next_record(con, &text[0],
-								    &ext_text[0], NULL,
-								    handover);
+				progress = console_emit_next_record_transferable(con, &text[0],
+								&ext_text[0], NULL, handover);
 			} else {
-				progress = console_emit_next_record(con, &text[0],
-								    NULL, &dropped_text[0],
-								    handover);
+				progress = console_emit_next_record_transferable(con, &text[0],
+								NULL, &dropped_text[0], handover);
 			}
 			if (*handover)
 				return false;
@@ -2999,6 +3120,10 @@ void console_unblank(void)
 	if (oops_in_progress) {
 		if (down_trylock_console_sem() != 0)
 			return;
+		if (!console_kthreads_atomic_tryblock()) {
+			up_console_sem();
+			return;
+		}
 	} else
 		console_lock();
 
@@ -3081,10 +3206,6 @@ void console_start(struct console *console)
 	console_lock();
 	console->flags |= CON_ENABLED;
 	console_unlock();
-
-	/* Wake the newly enabled kthread printer. */
-	wake_up_klogd();
-
 	__pr_flush(console, 1000, true);
 }
 EXPORT_SYMBOL(console_start);
@@ -3286,6 +3407,8 @@ void register_console(struct console *newcon)
 
 	newcon->dropped = 0;
 	newcon->thread = NULL;
+	newcon->blocked = true;
+	mutex_init(&newcon->lock);
 
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/* Get a consistent copy of @syslog_seq. */
@@ -3586,6 +3709,19 @@ static void printk_fallback_preferred_direct(void)
 	console_unlock();
 }
 
+/*
+ * Print a record for a given console, not allowing another printk() caller
+ * to take over. This is appropriate for contexts that do not have the
+ * console_lock.
+ *
+ * See __console_emit_next_record() for argument and return details.
+ */
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+				     char *dropped_text)
+{
+	return __console_emit_next_record(con, text, ext_text, dropped_text, NULL);
+}
+
 static bool printer_should_wake(struct console *con, u64 seq)
 {
 	short flags;
@@ -3593,8 +3729,10 @@ static bool printer_should_wake(struct console *con, u64 seq)
 	if (kthread_should_stop() || !printk_kthreads_available)
 		return true;
 
-	if (console_suspended)
+	if (con->blocked ||
+	    console_kthreads_atomically_blocked()) {
 		return false;
+	}
 
 	/*
 	 * This is an unsafe read from con->flags, but a false positive is
@@ -3615,7 +3753,6 @@ static int printk_kthread_func(void *data)
 	struct console *con = data;
 	char *dropped_text = NULL;
 	char *ext_text = NULL;
-	bool handover;
 	u64 seq = 0;
 	char *text;
 	int error;
@@ -3665,15 +3802,27 @@ static int printk_kthread_func(void *data)
 		if (error)
 			continue;
 
-		console_lock();
+		error = mutex_lock_interruptible(&con->lock);
+		if (error)
+			continue;
 
-		if (console_suspended) {
-			up_console_sem();
+		if (con->blocked ||
+		    !console_kthread_printing_tryenter()) {
+			/* Another context has locked the console_lock. */
+			mutex_unlock(&con->lock);
 			continue;
 		}
 
-		if (!console_is_usable(con)) {
-			__console_unlock();
+		/*
+		 * Although this context has not locked the console_lock, it
+		 * is known that the console_lock is not locked and it is not
+		 * possible for any other context to lock the console_lock.
+		 * Therefore it is safe to read con->flags.
+		 */
+
+		if (!__console_is_usable(con->flags)) {
+			console_kthread_printing_exit();
+			mutex_unlock(&con->lock);
 			continue;
 		}
 
@@ -3686,13 +3835,13 @@ static int printk_kthread_func(void *data)
 		 * which can conditionally invoke cond_resched().
 		 */
 		console_may_schedule = 0;
-		console_emit_next_record(con, text, ext_text, dropped_text, &handover);
-		if (handover)
-			continue;
+		console_emit_next_record(con, text, ext_text, dropped_text);
 
 		seq = con->seq;
 
-		__console_unlock();
+		console_kthread_printing_exit();
+
+		mutex_unlock(&con->lock);
 	}
 
 	con_printk(KERN_INFO, con, "printing thread stopped\n");
-- 
cgit v1.2.3


From ab406816fca009349b89cbde885daf68a8c77e33 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 21 Apr 2022 23:28:50 +0206
Subject: printk: remove @console_locked

The static global variable @console_locked is used to help debug
VT code to make sure that certain code paths are running with
the console_lock held. However, this information is also available
with the static global variable @console_kthreads_blocked (for
locking via console_lock()), and the static global variable
@console_kthreads_active (for locking via console_trylock()).

Remove @console_locked and update is_console_locked() to use the
alternative variables.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220421212250.565456-16-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 750d1229cc11..f66d6e72a642 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -340,15 +340,7 @@ static void console_kthreads_unblock(void)
 	console_kthreads_blocked = false;
 }
 
-/*
- * This is used for debugging the mess that is the VT code by
- * keeping track if we have the console semaphore held. It's
- * definitely not the perfect debug tool (we don't know if _WE_
- * hold it and are racing, but it helps tracking those weird code
- * paths in the console code where we end up in places I want
- * locked without the console semaphore held).
- */
-static int console_locked, console_suspended;
+static int console_suspended;
 
 /*
  *	Array of consoles built from command line options (console=)
@@ -2719,7 +2711,6 @@ void console_lock(void)
 	if (console_suspended)
 		return;
 	console_kthreads_block();
-	console_locked = 1;
 	console_may_schedule = 1;
 }
 EXPORT_SYMBOL(console_lock);
@@ -2744,15 +2735,26 @@ int console_trylock(void)
 		up_console_sem();
 		return 0;
 	}
-	console_locked = 1;
 	console_may_schedule = 0;
 	return 1;
 }
 EXPORT_SYMBOL(console_trylock);
 
+/*
+ * This is used to help to make sure that certain paths within the VT code are
+ * running with the console lock held. It is definitely not the perfect debug
+ * tool (it is not known if the VT code is the task holding the console lock),
+ * but it helps tracking those weird code paths in the console code such as
+ * when the console is suspended: where the console is not locked but no
+ * console printing may occur.
+ *
+ * Note: This returns true when the console is suspended but is not locked.
+ *       This is intentional because the VT code must consider that situation
+ *       the same as if the console was locked.
+ */
 int is_console_locked(void)
 {
-	return (console_locked || atomic_read(&console_kthreads_active));
+	return (console_kthreads_blocked || atomic_read(&console_kthreads_active));
 }
 EXPORT_SYMBOL(is_console_locked);
 
@@ -2808,8 +2810,6 @@ static inline bool console_is_usable(struct console *con)
 
 static void __console_unlock(void)
 {
-	console_locked = 0;
-
 	/*
 	 * Depending on whether console_lock() or console_trylock() was used,
 	 * appropriately allow the kthread printers to continue.
@@ -3127,7 +3127,6 @@ void console_unblank(void)
 	} else
 		console_lock();
 
-	console_locked = 1;
 	console_may_schedule = 0;
 	for_each_console(c)
 		if ((c->flags & CON_ENABLED) && c->unblank)
-- 
cgit v1.2.3


From 1d661ed54d8613c97bcff2c7d6181c61e482a1da Mon Sep 17 00:00:00 2001
From: Adam Zabrocki <pi3@pi3.com.pl>
Date: Fri, 22 Apr 2022 18:40:27 +0200
Subject: kprobes: Fix KRETPROBES when CONFIG_KRETPROBE_ON_RETHOOK is set

The recent kernel change in 73f9b911faa7 ("kprobes: Use rethook for kretprobe
if possible"), introduced a potential NULL pointer dereference bug in the
KRETPROBE mechanism. The official Kprobes documentation defines that "Any or
all handlers can be NULL". Unfortunately, there is a missing return handler
verification to fulfill these requirements and can result in a NULL pointer
dereference bug.

This patch adds such verification in kretprobe_rethook_handler() function.

Fixes: 73f9b911faa7 ("kprobes: Use rethook for kretprobe if possible")
Signed-off-by: Adam Zabrocki <pi3@pi3.com.pl>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Anil S. Keshavamurthy <anil.s.keshavamurthy@intel.com>
Link: https://lore.kernel.org/bpf/20220422164027.GA7862@pi3.com.pl
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dbe57df2e199..dd58c0be9ce2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2126,7 +2126,7 @@ static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
 	struct kprobe_ctlblk *kcb;
 
 	/* The data must NOT be null. This means rethook data structure is broken. */
-	if (WARN_ON_ONCE(!data))
+	if (WARN_ON_ONCE(!data) || !rp->handler)
 		return;
 
 	__this_cpu_write(current_kprobe, &rp->kp);
-- 
cgit v1.2.3


From c317ab71facc2cd0a94145973318a4c914e11acc Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Mon, 25 Apr 2022 21:32:47 +0800
Subject: bpf: Compute map_btf_id during build time

For now, the field 'map_btf_id' in 'struct bpf_map_ops' for all map
types are computed during vmlinux-btf init:

  btf_parse_vmlinux() -> btf_vmlinux_map_ids_init()

It will lookup the btf_type according to the 'map_btf_name' field in
'struct bpf_map_ops'. This process can be done during build time,
thanks to Jiri's resolve_btfids.

selftest of map_ptr has passed:

  $96 map_ptr:OK
  Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c          | 26 ++++++++------------------
 kernel/bpf/bloom_filter.c      |  6 +++---
 kernel/bpf/bpf_inode_storage.c |  6 +++---
 kernel/bpf/bpf_struct_ops.c    |  6 +++---
 kernel/bpf/bpf_task_storage.c  |  5 ++---
 kernel/bpf/btf.c               | 40 ----------------------------------------
 kernel/bpf/cpumap.c            |  6 +++---
 kernel/bpf/devmap.c            | 10 ++++------
 kernel/bpf/hashtab.c           | 22 +++++++---------------
 kernel/bpf/local_storage.c     |  7 ++++---
 kernel/bpf/lpm_trie.c          |  6 +++---
 kernel/bpf/queue_stack_maps.c  | 10 ++++------
 kernel/bpf/reuseport_array.c   |  6 +++---
 kernel/bpf/ringbuf.c           |  6 +++---
 kernel/bpf/stackmap.c          |  5 ++---
 15 files changed, 52 insertions(+), 115 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c3de63ce574e..b3bf31fd9458 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,6 +11,7 @@
 #include <linux/perf_event.h>
 #include <uapi/linux/btf.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
 
 #include "map_in_map.h"
 
@@ -690,7 +691,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
 	return num_elems;
 }
 
-static int array_map_btf_id;
+BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
 const struct bpf_map_ops array_map_ops = {
 	.map_meta_equal = array_map_meta_equal,
 	.map_alloc_check = array_map_alloc_check,
@@ -711,12 +712,10 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_batch = generic_map_update_batch,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_array_elem,
-	.map_btf_name = "bpf_array",
-	.map_btf_id = &array_map_btf_id,
+	.map_btf_id = &array_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
 };
 
-static int percpu_array_map_btf_id;
 const struct bpf_map_ops percpu_array_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = array_map_alloc_check,
@@ -732,8 +731,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_update_batch = generic_map_update_batch,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_array_elem,
-	.map_btf_name = "bpf_array",
-	.map_btf_id = &percpu_array_map_btf_id,
+	.map_btf_id = &array_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
 };
 
@@ -1112,7 +1110,6 @@ static void prog_array_map_free(struct bpf_map *map)
  * Thus, prog_array_map cannot be used as an inner_map
  * and map_meta_equal is not implemented.
  */
-static int prog_array_map_btf_id;
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = prog_array_map_alloc,
@@ -1128,8 +1125,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 	.map_release_uref = prog_array_map_clear,
 	.map_seq_show_elem = prog_array_map_seq_show_elem,
-	.map_btf_name = "bpf_array",
-	.map_btf_id = &prog_array_map_btf_id,
+	.map_btf_id = &array_map_btf_ids[0],
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -1218,7 +1214,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map)
 	fd_array_map_free(map);
 }
 
-static int perf_event_array_map_btf_id;
 const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = fd_array_map_alloc_check,
@@ -1231,8 +1226,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
 	.map_release = perf_event_fd_array_release,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_array",
-	.map_btf_id = &perf_event_array_map_btf_id,
+	.map_btf_id = &array_map_btf_ids[0],
 };
 
 #ifdef CONFIG_CGROUPS
@@ -1255,7 +1249,6 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 	fd_array_map_free(map);
 }
 
-static int cgroup_array_map_btf_id;
 const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = fd_array_map_alloc_check,
@@ -1267,8 +1260,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
 	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
 	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_array",
-	.map_btf_id = &cgroup_array_map_btf_id,
+	.map_btf_id = &array_map_btf_ids[0],
 };
 #endif
 
@@ -1342,7 +1334,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
 	return insn - insn_buf;
 }
 
-static int array_of_maps_map_btf_id;
 const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
@@ -1355,6 +1346,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_array",
-	.map_btf_id = &array_of_maps_map_btf_id,
+	.map_btf_id = &array_map_btf_ids[0],
 };
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index b141a1346f72..b9ea539a5561 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -7,6 +7,7 @@
 #include <linux/err.h>
 #include <linux/jhash.h>
 #include <linux/random.h>
+#include <linux/btf_ids.h>
 
 #define BLOOM_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
@@ -192,7 +193,7 @@ static int bloom_map_check_btf(const struct bpf_map *map,
 	return btf_type_is_void(key_type) ? 0 : -EINVAL;
 }
 
-static int bpf_bloom_map_btf_id;
+BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
 const struct bpf_map_ops bloom_filter_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = bloom_map_alloc,
@@ -205,6 +206,5 @@ const struct bpf_map_ops bloom_filter_map_ops = {
 	.map_update_elem = bloom_map_update_elem,
 	.map_delete_elem = bloom_map_delete_elem,
 	.map_check_btf = bloom_map_check_btf,
-	.map_btf_name = "bpf_bloom_filter",
-	.map_btf_id = &bpf_bloom_map_btf_id,
+	.map_btf_id = &bpf_bloom_map_btf_ids[0],
 };
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 10424a1cda51..5f7683b19199 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -245,7 +245,8 @@ static void inode_storage_map_free(struct bpf_map *map)
 	bpf_local_storage_map_free(smap, NULL);
 }
 
-static int inode_storage_map_btf_id;
+BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct,
+		   bpf_local_storage_map)
 const struct bpf_map_ops inode_storage_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -256,8 +257,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
 	.map_update_elem = bpf_fd_inode_storage_update_elem,
 	.map_delete_elem = bpf_fd_inode_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
-	.map_btf_name = "bpf_local_storage_map",
-	.map_btf_id = &inode_storage_map_btf_id,
+	.map_btf_id = &inode_storage_map_btf_ids[0],
 	.map_owner_storage_ptr = inode_storage_ptr,
 };
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index de01d37c2d3b..3a0103ad97bc 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -10,6 +10,7 @@
 #include <linux/seq_file.h>
 #include <linux/refcount.h>
 #include <linux/mutex.h>
+#include <linux/btf_ids.h>
 
 enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
@@ -612,7 +613,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	return map;
 }
 
-static int bpf_struct_ops_map_btf_id;
+BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
 const struct bpf_map_ops bpf_struct_ops_map_ops = {
 	.map_alloc_check = bpf_struct_ops_map_alloc_check,
 	.map_alloc = bpf_struct_ops_map_alloc,
@@ -622,8 +623,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 	.map_delete_elem = bpf_struct_ops_map_delete_elem,
 	.map_update_elem = bpf_struct_ops_map_update_elem,
 	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
-	.map_btf_name = "bpf_struct_ops_map",
-	.map_btf_id = &bpf_struct_ops_map_btf_id,
+	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
 };
 
 /* "const void *" because some subsystem is
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 57904263a710..e9014dc62682 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -307,7 +307,7 @@ static void task_storage_map_free(struct bpf_map *map)
 	bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
 }
 
-static int task_storage_map_btf_id;
+BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map)
 const struct bpf_map_ops task_storage_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -318,8 +318,7 @@ const struct bpf_map_ops task_storage_map_ops = {
 	.map_update_elem = bpf_pid_task_storage_update_elem,
 	.map_delete_elem = bpf_pid_task_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
-	.map_btf_name = "bpf_local_storage_map",
-	.map_btf_id = &task_storage_map_btf_id,
+	.map_btf_id = &task_storage_map_btf_ids[0],
 	.map_owner_storage_ptr = task_storage_ptr,
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4cfaf5eebecd..2f0b0440131c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5025,41 +5025,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 	return ctx_type;
 }
 
-static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
-#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
-#define BPF_LINK_TYPE(_id, _name)
-#define BPF_MAP_TYPE(_id, _ops) \
-	[_id] = &_ops,
-#include <linux/bpf_types.h>
-#undef BPF_PROG_TYPE
-#undef BPF_LINK_TYPE
-#undef BPF_MAP_TYPE
-};
-
-static int btf_vmlinux_map_ids_init(const struct btf *btf,
-				    struct bpf_verifier_log *log)
-{
-	const struct bpf_map_ops *ops;
-	int i, btf_id;
-
-	for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
-		ops = btf_vmlinux_map_ops[i];
-		if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
-			continue;
-		if (!ops->map_btf_name || !ops->map_btf_id) {
-			bpf_log(log, "map type %d is misconfigured\n", i);
-			return -EINVAL;
-		}
-		btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
-					       BTF_KIND_STRUCT);
-		if (btf_id < 0)
-			return btf_id;
-		*ops->map_btf_id = btf_id;
-	}
-
-	return 0;
-}
-
 static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				     struct btf *btf,
 				     const struct btf_type *t,
@@ -5125,11 +5090,6 @@ struct btf *btf_parse_vmlinux(void)
 	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
 	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
 
-	/* find bpf map structs for map_ptr access checking */
-	err = btf_vmlinux_map_ids_init(btf, log);
-	if (err < 0)
-		goto errout;
-
 	bpf_struct_ops_init(btf, log);
 
 	refcount_set(&btf->refcnt, 1);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 650e5d21f90d..f4860ac756cd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>
 #include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
 
 #include <linux/netdevice.h>   /* netif_receive_skb_list */
 #include <linux/etherdevice.h> /* eth_type_trans */
@@ -673,7 +674,7 @@ static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 				      __cpu_map_lookup_elem);
 }
 
-static int cpu_map_btf_id;
+BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
 const struct bpf_map_ops cpu_map_ops = {
 	.map_meta_equal		= bpf_map_meta_equal,
 	.map_alloc		= cpu_map_alloc,
@@ -683,8 +684,7 @@ const struct bpf_map_ops cpu_map_ops = {
 	.map_lookup_elem	= cpu_map_lookup_elem,
 	.map_get_next_key	= cpu_map_get_next_key,
 	.map_check_btf		= map_check_no_btf,
-	.map_btf_name		= "bpf_cpu_map",
-	.map_btf_id		= &cpu_map_btf_id,
+	.map_btf_id		= &cpu_map_btf_ids[0],
 	.map_redirect		= cpu_map_redirect,
 };
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 038f6d7a83e4..c2867068e5bd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -1005,7 +1006,7 @@ static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 				      __dev_map_hash_lookup_elem);
 }
 
-static int dev_map_btf_id;
+BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
 const struct bpf_map_ops dev_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = dev_map_alloc,
@@ -1015,12 +1016,10 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_dtab",
-	.map_btf_id = &dev_map_btf_id,
+	.map_btf_id = &dev_map_btf_ids[0],
 	.map_redirect = dev_map_redirect,
 };
 
-static int dev_map_hash_map_btf_id;
 const struct bpf_map_ops dev_map_hash_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = dev_map_alloc,
@@ -1030,8 +1029,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
 	.map_update_elem = dev_map_hash_update_elem,
 	.map_delete_elem = dev_map_hash_delete_elem,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_dtab",
-	.map_btf_id = &dev_map_hash_map_btf_id,
+	.map_btf_id = &dev_map_btf_ids[0],
 	.map_redirect = dev_hash_map_redirect,
 };
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 7351e61bd683..3e00e62b2218 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -10,6 +10,7 @@
 #include <linux/random.h>
 #include <uapi/linux/btf.h>
 #include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
@@ -2137,7 +2138,7 @@ out:
 	return num_elems;
 }
 
-static int htab_map_btf_id;
+BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
 const struct bpf_map_ops htab_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = htab_map_alloc_check,
@@ -2154,12 +2155,10 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab),
-	.map_btf_name = "bpf_htab",
-	.map_btf_id = &htab_map_btf_id,
+	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
 };
 
-static int htab_lru_map_btf_id;
 const struct bpf_map_ops htab_lru_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = htab_map_alloc_check,
@@ -2177,8 +2176,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab_lru),
-	.map_btf_name = "bpf_htab",
-	.map_btf_id = &htab_lru_map_btf_id,
+	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
 };
 
@@ -2284,7 +2282,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
-static int htab_percpu_map_btf_id;
 const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = htab_map_alloc_check,
@@ -2299,12 +2296,10 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab_percpu),
-	.map_btf_name = "bpf_htab",
-	.map_btf_id = &htab_percpu_map_btf_id,
+	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
 };
 
-static int htab_lru_percpu_map_btf_id;
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = htab_map_alloc_check,
@@ -2319,8 +2314,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab_lru_percpu),
-	.map_btf_name = "bpf_htab",
-	.map_btf_id = &htab_lru_percpu_map_btf_id,
+	.map_btf_id = &htab_map_btf_ids[0],
 	.iter_seq_info = &iter_seq_info,
 };
 
@@ -2444,7 +2438,6 @@ static void htab_of_map_free(struct bpf_map *map)
 	fd_htab_map_free(map);
 }
 
-static int htab_of_maps_map_btf_id;
 const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_alloc_check = fd_htab_map_alloc_check,
 	.map_alloc = htab_of_map_alloc,
@@ -2457,6 +2450,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = htab_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_htab",
-	.map_btf_id = &htab_of_maps_map_btf_id,
+	.map_btf_id = &htab_map_btf_ids[0],
 };
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 497916060ac7..8654fc97f5fe 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,6 +9,7 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -446,7 +447,8 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
-static int cgroup_storage_map_btf_id;
+BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
+		   bpf_cgroup_storage_map)
 const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_alloc = cgroup_storage_map_alloc,
 	.map_free = cgroup_storage_map_free,
@@ -456,8 +458,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
 	.map_delete_elem = cgroup_storage_delete_elem,
 	.map_check_btf = cgroup_storage_check_btf,
 	.map_seq_show_elem = cgroup_storage_seq_show_elem,
-	.map_btf_name = "bpf_cgroup_storage_map",
-	.map_btf_id = &cgroup_storage_map_btf_id,
+	.map_btf_id = &cgroup_storage_map_btf_ids[0],
 };
 
 int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 5763cc7ac4f1..f0d05a3cc4b9 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -14,6 +14,7 @@
 #include <linux/vmalloc.h>
 #include <net/ipv6.h>
 #include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
@@ -719,7 +720,7 @@ static int trie_check_btf(const struct bpf_map *map,
 	       -EINVAL : 0;
 }
 
-static int trie_map_btf_id;
+BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
 const struct bpf_map_ops trie_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = trie_alloc,
@@ -732,6 +733,5 @@ const struct bpf_map_ops trie_map_ops = {
 	.map_update_batch = generic_map_update_batch,
 	.map_delete_batch = generic_map_delete_batch,
 	.map_check_btf = trie_check_btf,
-	.map_btf_name = "lpm_trie",
-	.map_btf_id = &trie_map_btf_id,
+	.map_btf_id = &trie_map_btf_ids[0],
 };
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f9c734aaa990..a1c0794ae49d 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -8,6 +8,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/capability.h>
+#include <linux/btf_ids.h>
 #include "percpu_freelist.h"
 
 #define QUEUE_STACK_CREATE_FLAG_MASK \
@@ -247,7 +248,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
 	return -EINVAL;
 }
 
-static int queue_map_btf_id;
+BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
 const struct bpf_map_ops queue_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = queue_stack_map_alloc_check,
@@ -260,11 +261,9 @@ const struct bpf_map_ops queue_map_ops = {
 	.map_pop_elem = queue_map_pop_elem,
 	.map_peek_elem = queue_map_peek_elem,
 	.map_get_next_key = queue_stack_map_get_next_key,
-	.map_btf_name = "bpf_queue_stack",
-	.map_btf_id = &queue_map_btf_id,
+	.map_btf_id = &queue_map_btf_ids[0],
 };
 
-static int stack_map_btf_id;
 const struct bpf_map_ops stack_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = queue_stack_map_alloc_check,
@@ -277,6 +276,5 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_pop_elem = stack_map_pop_elem,
 	.map_peek_elem = stack_map_peek_elem,
 	.map_get_next_key = queue_stack_map_get_next_key,
-	.map_btf_name = "bpf_queue_stack",
-	.map_btf_id = &stack_map_btf_id,
+	.map_btf_id = &queue_map_btf_ids[0],
 };
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 8251243022a2..e2618fb5870e 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -6,6 +6,7 @@
 #include <linux/err.h>
 #include <linux/sock_diag.h>
 #include <net/sock_reuseport.h>
+#include <linux/btf_ids.h>
 
 struct reuseport_array {
 	struct bpf_map map;
@@ -337,7 +338,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
 	return 0;
 }
 
-static int reuseport_array_map_btf_id;
+BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
 const struct bpf_map_ops reuseport_array_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc_check = reuseport_array_alloc_check,
@@ -346,6 +347,5 @@ const struct bpf_map_ops reuseport_array_ops = {
 	.map_lookup_elem = reuseport_array_lookup_elem,
 	.map_get_next_key = reuseport_array_get_next_key,
 	.map_delete_elem = reuseport_array_delete_elem,
-	.map_btf_name = "reuseport_array",
-	.map_btf_id = &reuseport_array_map_btf_id,
+	.map_btf_id = &reuseport_array_map_btf_ids[0],
 };
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 5173fd37590f..311264ab80c4 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -10,6 +10,7 @@
 #include <linux/poll.h>
 #include <linux/kmemleak.h>
 #include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
 
 #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
 
@@ -263,7 +264,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
 	return 0;
 }
 
-static int ringbuf_map_btf_id;
+BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
 const struct bpf_map_ops ringbuf_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = ringbuf_map_alloc,
@@ -274,8 +275,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
 	.map_update_elem = ringbuf_map_update_elem,
 	.map_delete_elem = ringbuf_map_delete_elem,
 	.map_get_next_key = ringbuf_map_get_next_key,
-	.map_btf_name = "bpf_ringbuf_map",
-	.map_btf_id = &ringbuf_map_btf_id,
+	.map_btf_id = &ringbuf_map_btf_ids[0],
 };
 
 /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 1dd5266fbebb..1adbe67cdb95 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -654,7 +654,7 @@ static void stack_map_free(struct bpf_map *map)
 	put_callchain_buffers();
 }
 
-static int stack_trace_map_btf_id;
+BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
 const struct bpf_map_ops stack_trace_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = stack_map_alloc,
@@ -664,6 +664,5 @@ const struct bpf_map_ops stack_trace_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
-	.map_btf_name = "bpf_stack_map",
-	.map_btf_id = &stack_trace_map_btf_id,
+	.map_btf_id = &stack_trace_map_btf_ids[0],
 };
-- 
cgit v1.2.3


From 217d8c05ec6295947b0b71bc784012d112f0032e Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Mon, 7 Feb 2022 05:12:16 -0800
Subject: tracing: Cleanup double word in comment

Remove the second 'is' and 'to'.

Link: https://lkml.kernel.org/r/20220207131216.2059997-1-trix@redhat.com

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/pid_list.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index a2ef1d18126a..95106d02b32d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1,
 /**
  * trace_pid_list_is_set - test if the pid is set in the list
  * @pid_list: The pid list to test
- * @pid: The pid to to see if set in the list.
+ * @pid: The pid to see if set in the list.
  *
- * Tests if @pid is is set in the @pid_list. This is usually called
+ * Tests if @pid is set in the @pid_list. This is usually called
  * from the scheduler when a task is scheduled. Its pid is checked
  * if it should be traced or not.
  *
-- 
cgit v1.2.3


From b8cc44a4d3c19296dfd1be1a018a8523e09ab919 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 4 Feb 2022 16:12:04 -0600
Subject: tracing: Remove logic for registering multiple event triggers at a
 time

Code for registering triggers assumes it's possible to register more
than one trigger at a time.  In fact, it's unimplemented and there
doesn't seem to be a reason to do that.

Remove the n_registered param from event_trigger_register() and fix up
callers.

Doing so simplifies the logic in event_trigger_register to the point
that it just becomes a wrapper calling event_command.reg().

It also removes the problematic call to event_command.unreg() in case
of failure.  A new function, event_trigger_unregister() is also added
for callers to call themselves.

The changes to trace_events_hist.c simply allow compilation; a
separate patch follows which updates the hist triggers to work
correctly with the new changes.

Link: https://lkml.kernel.org/r/6149fec7a139d93e84fa4535672fb5bef88006b0.1644010575.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.h                |  9 ++--
 kernel/trace/trace_events_hist.c    | 17 +++----
 kernel/trace/trace_events_trigger.c | 96 +++++++++++++------------------------
 3 files changed, 45 insertions(+), 77 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 07d990270e2a..2bc8036b0659 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1629,10 +1629,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops,
 extern int event_trigger_register(struct event_command *cmd_ops,
 				  struct trace_event_file *file,
 				  char *glob,
-				  char *cmd,
-				  char *trigger,
-				  struct event_trigger_data *trigger_data,
-				  int *n_registered);
+				  struct event_trigger_data *trigger_data);
+extern void event_trigger_unregister(struct event_command *cmd_ops,
+				     struct trace_event_file *file,
+				     char *glob,
+				     struct event_trigger_data *trigger_data);
 
 /**
  * struct event_trigger_ops - callbacks for trace event triggers
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 44db5ba9cabb..b18d00905eae 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6287,7 +6287,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 			goto out_free;
 		}
 
-		cmd_ops->unreg(glob+1, trigger_data, file);
+		event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
 		se_name = trace_event_name(file->event_call);
 		se = find_synth_event(se_name);
 		if (se)
@@ -6296,13 +6296,10 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 		goto out_free;
 	}
 
-	ret = cmd_ops->reg(glob, trigger_data, file);
-	/*
-	 * The above returns on success the # of triggers registered,
-	 * but if it didn't register any it returns zero.  Consider no
-	 * triggers registered a failure too.
-	 */
-	if (!ret) {
+	ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+	if (ret)
+		goto out_free;
+	if (ret == 0) {
 		if (!(attrs->pause || attrs->cont || attrs->clear))
 			ret = -ENOENT;
 		goto out_free;
@@ -6331,15 +6328,13 @@ enable:
 	se = find_synth_event(se_name);
 	if (se)
 		se->ref++;
-	/* Just return zero, not the number of registered triggers */
-	ret = 0;
  out:
 	if (ret == 0)
 		hist_err_clear();
 
 	return ret;
  out_unreg:
-	cmd_ops->unreg(glob+1, trigger_data, file);
+	event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
  out_free:
 	if (cmd_ops->set_filter)
 		cmd_ops->set_filter(NULL, trigger_data, NULL);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 7eb9d04f1c2e..ea13679054a8 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -587,13 +587,12 @@ static int register_trigger(char *glob,
 	}
 
 	list_add_rcu(&data->list, &file->triggers);
-	ret++;
 
 	update_cond_flag(file);
-	if (trace_event_trigger_enable_disable(file, 1) < 0) {
+	ret = trace_event_trigger_enable_disable(file, 1);
+	if (ret < 0) {
 		list_del_rcu(&data->list);
 		update_cond_flag(file);
-		ret--;
 	}
 out:
 	return ret;
@@ -927,48 +926,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops,
  * @cmd_ops: The event_command operations for the trigger
  * @file: The event file for the trigger's event
  * @glob: The trigger command string, with optional remove(!) operator
- * @cmd: The cmd string
- * @param: The param string
  * @trigger_data: The trigger_data for the trigger
- * @n_registered: optional outparam, the number of triggers registered
  *
  * Register an event trigger.  The @cmd_ops are used to call the
- * cmd_ops->reg() function which actually does the registration. The
- * cmd_ops->reg() function returns the number of triggers registered,
- * which is assigned to n_registered, if n_registered is non-NULL.
+ * cmd_ops->reg() function which actually does the registration.
  *
  * Return: 0 on success, errno otherwise
  */
 int event_trigger_register(struct event_command *cmd_ops,
 			   struct trace_event_file *file,
 			   char *glob,
-			   char *cmd,
-			   char *param,
-			   struct event_trigger_data *trigger_data,
-			   int *n_registered)
+			   struct event_trigger_data *trigger_data)
 {
-	int ret;
-
-	if (n_registered)
-		*n_registered = 0;
-
-	ret = cmd_ops->reg(glob, trigger_data, file);
-	/*
-	 * The above returns on success the # of functions enabled,
-	 * but if it didn't find any functions it returns zero.
-	 * Consider no functions a failure too.
-	 */
-	if (!ret) {
-		cmd_ops->unreg(glob, trigger_data, file);
-		ret = -ENOENT;
-	} else if (ret > 0) {
-		if (n_registered)
-			*n_registered = ret;
-		/* Just return zero, not the number of enabled functions */
-		ret = 0;
-	}
+	return cmd_ops->reg(glob, trigger_data, file);
+}
 
-	return ret;
+/**
+ * event_trigger_unregister - unregister an event trigger
+ * @cmd_ops: The event_command operations for the trigger
+ * @file: The event file for the trigger's event
+ * @glob: The trigger command string, with optional remove(!) operator
+ * @trigger_data: The trigger_data for the trigger
+ *
+ * Unregister an event trigger.  The @cmd_ops are used to call the
+ * cmd_ops->unreg() function which actually does the unregistration.
+ */
+void event_trigger_unregister(struct event_command *cmd_ops,
+			      struct trace_event_file *file,
+			      char *glob,
+			      struct event_trigger_data *trigger_data)
+{
+	cmd_ops->unreg(glob, trigger_data, file);
 }
 
 /*
@@ -1027,7 +1015,7 @@ event_trigger_parse(struct event_command *cmd_ops,
 	INIT_LIST_HEAD(&trigger_data->named_list);
 
 	if (glob[0] == '!') {
-		cmd_ops->unreg(glob+1, trigger_data, file);
+		event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
 		kfree(trigger_data);
 		ret = 0;
 		goto out;
@@ -1062,17 +1050,10 @@ event_trigger_parse(struct event_command *cmd_ops,
  out_reg:
 	/* Up the trigger_data count to make sure reg doesn't free it on failure */
 	event_trigger_init(trigger_ops, trigger_data);
-	ret = cmd_ops->reg(glob, trigger_data, file);
-	/*
-	 * The above returns on success the # of functions enabled,
-	 * but if it didn't find any functions it returns zero.
-	 * Consider no functions a failure too.
-	 */
-	if (!ret) {
-		cmd_ops->unreg(glob, trigger_data, file);
-		ret = -ENOENT;
-	} else if (ret > 0)
-		ret = 0;
+
+	ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+	if (ret)
+		goto out_free;
 
 	/* Down the counter of trigger_data or free it if not used anymore */
 	event_trigger_free(trigger_ops, trigger_data);
@@ -1854,7 +1835,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 	trigger_data->private_data = enable_data;
 
 	if (glob[0] == '!') {
-		cmd_ops->unreg(glob+1, trigger_data, file);
+		event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
 		kfree(trigger_data);
 		kfree(enable_data);
 		ret = 0;
@@ -1901,19 +1882,11 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 	ret = trace_event_enable_disable(event_enable_file, 1, 1);
 	if (ret < 0)
 		goto out_put;
-	ret = cmd_ops->reg(glob, trigger_data, file);
-	/*
-	 * The above returns on success the # of functions enabled,
-	 * but if it didn't find any functions it returns zero.
-	 * Consider no functions a failure too.
-	 */
-	if (!ret) {
-		ret = -ENOENT;
-		goto out_disable;
-	} else if (ret < 0)
+
+	ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+	if (ret)
 		goto out_disable;
-	/* Just return zero, not the number of enabled functions */
-	ret = 0;
+
 	event_trigger_free(trigger_ops, trigger_data);
  out:
 	return ret;
@@ -1959,13 +1932,12 @@ int event_enable_register_trigger(char *glob,
 	}
 
 	list_add_rcu(&data->list, &file->triggers);
-	ret++;
 
 	update_cond_flag(file);
-	if (trace_event_trigger_enable_disable(file, 1) < 0) {
+	ret = trace_event_trigger_enable_disable(file, 1);
+	if (ret < 0) {
 		list_del_rcu(&data->list);
 		update_cond_flag(file);
-		ret--;
 	}
 out:
 	return ret;
-- 
cgit v1.2.3


From 476705419518a2f383b6695782ba1f285a2959b9 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 4 Feb 2022 16:12:05 -0600
Subject: tracing: Remove redundant trigger_ops params

Since event_trigger_data contains the .ops trigger_ops field, there's
no reason to pass the trigger_ops separately. Remove it as a param
from functions whenever event_trigger_data is passed.

Link: https://lkml.kernel.org/r/9856c9bc81bde57077f5b8d6f8faa47156c6354a.1644010575.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.h                | 14 ++++-------
 kernel/trace/trace_eprobe.c         |  7 ++----
 kernel/trace/trace_events_hist.c    | 29 ++++++++++-------------
 kernel/trace/trace_events_trigger.c | 46 +++++++++++++++----------------------
 4 files changed, 36 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2bc8036b0659..f3e4cd1bb60e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1573,10 +1573,8 @@ struct enable_trigger_data {
 };
 
 extern int event_enable_trigger_print(struct seq_file *m,
-				      struct event_trigger_ops *ops,
-				      struct event_trigger_data *data);
-extern void event_enable_trigger_free(struct event_trigger_ops *ops,
 				      struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_data *data);
 extern int event_enable_trigger_parse(struct event_command *cmd_ops,
 				      struct trace_event_file *file,
 				      char *glob, char *cmd, char *param);
@@ -1587,8 +1585,7 @@ extern void event_enable_unregister_trigger(char *glob,
 					    struct event_trigger_data *test,
 					    struct trace_event_file *file);
 extern void trigger_data_free(struct event_trigger_data *data);
-extern int event_trigger_init(struct event_trigger_ops *ops,
-			      struct event_trigger_data *data);
+extern int event_trigger_init(struct event_trigger_data *data);
 extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
 					      int trigger_enable);
 extern void update_cond_flag(struct trace_event_file *file);
@@ -1687,12 +1684,9 @@ struct event_trigger_ops {
 					   struct trace_buffer *buffer,
 					   void *rec,
 					   struct ring_buffer_event *rbe);
-	int			(*init)(struct event_trigger_ops *ops,
-					struct event_trigger_data *data);
-	void			(*free)(struct event_trigger_ops *ops,
-					struct event_trigger_data *data);
+	int			(*init)(struct event_trigger_data *data);
+	void			(*free)(struct event_trigger_data *data);
 	int			(*print)(struct seq_file *m,
-					 struct event_trigger_ops *ops,
 					 struct event_trigger_data *data);
 };
 
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 541aa13581b9..9b3dfc9da067 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -511,20 +511,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
  * functions are just stubs to fulfill what is needed to use the trigger
  * infrastructure.
  */
-static int eprobe_trigger_init(struct event_trigger_ops *ops,
-			       struct event_trigger_data *data)
+static int eprobe_trigger_init(struct event_trigger_data *data)
 {
 	return 0;
 }
 
-static void eprobe_trigger_free(struct event_trigger_ops *ops,
-				struct event_trigger_data *data)
+static void eprobe_trigger_free(struct event_trigger_data *data)
 {
 
 }
 
 static int eprobe_trigger_print(struct seq_file *m,
-				struct event_trigger_ops *ops,
 				struct event_trigger_data *data)
 {
 	/* Do not print eprobe event triggers */
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index b18d00905eae..40a7e5582dfb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5252,7 +5252,7 @@ static void hist_trigger_show(struct seq_file *m,
 		seq_puts(m, "\n\n");
 
 	seq_puts(m, "# event histogram\n#\n# trigger info: ");
-	data->ops->print(m, data->ops, data);
+	data->ops->print(m, data);
 	seq_puts(m, "#\n\n");
 
 	hist_data = data->private_data;
@@ -5484,7 +5484,7 @@ static void hist_trigger_debug_show(struct seq_file *m,
 		seq_puts(m, "\n\n");
 
 	seq_puts(m, "# event histogram\n#\n# trigger info: ");
-	data->ops->print(m, data->ops, data);
+	data->ops->print(m, data);
 	seq_puts(m, "#\n\n");
 
 	hist_data = data->private_data;
@@ -5621,7 +5621,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 }
 
 static int event_hist_trigger_print(struct seq_file *m,
-				    struct event_trigger_ops *ops,
 				    struct event_trigger_data *data)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
@@ -5729,8 +5728,7 @@ static int event_hist_trigger_print(struct seq_file *m,
 	return 0;
 }
 
-static int event_hist_trigger_init(struct event_trigger_ops *ops,
-				   struct event_trigger_data *data)
+static int event_hist_trigger_init(struct event_trigger_data *data)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
 
@@ -5758,8 +5756,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
 	}
 }
 
-static void event_hist_trigger_free(struct event_trigger_ops *ops,
-				    struct event_trigger_data *data)
+static void event_hist_trigger_free(struct event_trigger_data *data)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
 
@@ -5788,25 +5785,23 @@ static struct event_trigger_ops event_hist_trigger_ops = {
 	.free			= event_hist_trigger_free,
 };
 
-static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
-					 struct event_trigger_data *data)
+static int event_hist_trigger_named_init(struct event_trigger_data *data)
 {
 	data->ref++;
 
 	save_named_trigger(data->named_data->name, data);
 
-	event_hist_trigger_init(ops, data->named_data);
+	event_hist_trigger_init(data->named_data);
 
 	return 0;
 }
 
-static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
-					  struct event_trigger_data *data)
+static void event_hist_trigger_named_free(struct event_trigger_data *data)
 {
 	if (WARN_ON_ONCE(data->ref <= 0))
 		return;
 
-	event_hist_trigger_free(ops, data->named_data);
+	event_hist_trigger_free(data->named_data);
 
 	data->ref--;
 	if (!data->ref) {
@@ -5993,7 +5988,7 @@ static int hist_register_trigger(char *glob,
 	}
 
 	if (data->ops->init) {
-		ret = data->ops->init(data->ops, data);
+		ret = data->ops->init(data);
 		if (ret < 0)
 			goto out;
 	}
@@ -6111,7 +6106,7 @@ static void hist_unregister_trigger(char *glob,
 	}
 
 	if (unregistered && test->ops->free)
-		test->ops->free(test->ops, test);
+		test->ops->free(test);
 
 	if (hist_data->enable_timestamps) {
 		if (!hist_data->remove || unregistered)
@@ -6164,7 +6159,7 @@ static void hist_unreg_all(struct trace_event_file *file)
 			if (hist_data->enable_timestamps)
 				tracing_set_filter_buffering(file->tr, false);
 			if (test->ops->free)
-				test->ops->free(test->ops, test);
+				test->ops->free(test);
 		}
 	}
 }
@@ -6458,7 +6453,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file)
 			update_cond_flag(file);
 			trace_event_trigger_enable_disable(file, 0);
 			if (test->ops->free)
-				test->ops->free(test->ops, test);
+				test->ops->free(test);
 		}
 	}
 }
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index ea13679054a8..32202175875d 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -188,7 +188,7 @@ static int trigger_show(struct seq_file *m, void *v)
 	}
 
 	data = list_entry(v, struct event_trigger_data, list);
-	data->ops->print(m, data->ops, data);
+	data->ops->print(m, data);
 
 	return 0;
 }
@@ -432,7 +432,6 @@ event_trigger_print(const char *name, struct seq_file *m,
 
 /**
  * event_trigger_init - Generic event_trigger_ops @init implementation
- * @ops: The trigger ops associated with the trigger
  * @data: Trigger-specific data
  *
  * Common implementation of event trigger initialization.
@@ -442,8 +441,7 @@ event_trigger_print(const char *name, struct seq_file *m,
  *
  * Return: 0 on success, errno otherwise
  */
-int event_trigger_init(struct event_trigger_ops *ops,
-		       struct event_trigger_data *data)
+int event_trigger_init(struct event_trigger_data *data)
 {
 	data->ref++;
 	return 0;
@@ -451,7 +449,6 @@ int event_trigger_init(struct event_trigger_ops *ops,
 
 /**
  * event_trigger_free - Generic event_trigger_ops @free implementation
- * @ops: The trigger ops associated with the trigger
  * @data: Trigger-specific data
  *
  * Common implementation of event trigger de-initialization.
@@ -460,8 +457,7 @@ int event_trigger_init(struct event_trigger_ops *ops,
  * implementations.
  */
 static void
-event_trigger_free(struct event_trigger_ops *ops,
-		   struct event_trigger_data *data)
+event_trigger_free(struct event_trigger_data *data)
 {
 	if (WARN_ON_ONCE(data->ref <= 0))
 		return;
@@ -515,7 +511,7 @@ clear_event_triggers(struct trace_array *tr)
 			trace_event_trigger_enable_disable(file, 0);
 			list_del_rcu(&data->list);
 			if (data->ops->free)
-				data->ops->free(data->ops, data);
+				data->ops->free(data);
 		}
 	}
 }
@@ -581,7 +577,7 @@ static int register_trigger(char *glob,
 	}
 
 	if (data->ops->init) {
-		ret = data->ops->init(data->ops, data);
+		ret = data->ops->init(data);
 		if (ret < 0)
 			goto out;
 	}
@@ -629,7 +625,7 @@ static void unregister_trigger(char *glob,
 	}
 
 	if (unregistered && data->ops->free)
-		data->ops->free(data->ops, data);
+		data->ops->free(data);
 }
 
 /*
@@ -1049,14 +1045,14 @@ event_trigger_parse(struct event_command *cmd_ops,
 
  out_reg:
 	/* Up the trigger_data count to make sure reg doesn't free it on failure */
-	event_trigger_init(trigger_ops, trigger_data);
+	event_trigger_init(trigger_data);
 
 	ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
 	if (ret)
 		goto out_free;
 
 	/* Down the counter of trigger_data or free it if not used anymore */
-	event_trigger_free(trigger_ops, trigger_data);
+	event_trigger_free(trigger_data);
  out:
 	return ret;
 
@@ -1382,16 +1378,14 @@ traceoff_count_trigger(struct event_trigger_data *data,
 }
 
 static int
-traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-		      struct event_trigger_data *data)
+traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data)
 {
 	return event_trigger_print("traceon", m, (void *)data->count,
 				   data->filter_str);
 }
 
 static int
-traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-		       struct event_trigger_data *data)
+traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
 {
 	return event_trigger_print("traceoff", m, (void *)data->count,
 				   data->filter_str);
@@ -1502,8 +1496,7 @@ register_snapshot_trigger(char *glob,
 }
 
 static int
-snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-		       struct event_trigger_data *data)
+snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
 {
 	return event_trigger_print("snapshot", m, (void *)data->count,
 				   data->filter_str);
@@ -1598,8 +1591,7 @@ stacktrace_count_trigger(struct event_trigger_data *data,
 }
 
 static int
-stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-			 struct event_trigger_data *data)
+stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
 {
 	return event_trigger_print("stacktrace", m, (void *)data->count,
 				   data->filter_str);
@@ -1689,7 +1681,6 @@ event_enable_count_trigger(struct event_trigger_data *data,
 }
 
 int event_enable_trigger_print(struct seq_file *m,
-			       struct event_trigger_ops *ops,
 			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
@@ -1714,8 +1705,7 @@ int event_enable_trigger_print(struct seq_file *m,
 	return 0;
 }
 
-void event_enable_trigger_free(struct event_trigger_ops *ops,
-			       struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
@@ -1843,7 +1833,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 	}
 
 	/* Up the trigger_data count to make sure nothing frees it on failure */
-	event_trigger_init(trigger_ops, trigger_data);
+	event_trigger_init(trigger_data);
 
 	if (trigger) {
 		number = strsep(&trigger, ":");
@@ -1887,7 +1877,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 	if (ret)
 		goto out_disable;
 
-	event_trigger_free(trigger_ops, trigger_data);
+	event_trigger_free(trigger_data);
  out:
 	return ret;
 
@@ -1898,7 +1888,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
  out_free:
 	if (cmd_ops->set_filter)
 		cmd_ops->set_filter(NULL, trigger_data, NULL);
-	event_trigger_free(trigger_ops, trigger_data);
+	event_trigger_free(trigger_data);
 	kfree(enable_data);
 	goto out;
 }
@@ -1926,7 +1916,7 @@ int event_enable_register_trigger(char *glob,
 	}
 
 	if (data->ops->init) {
-		ret = data->ops->init(data->ops, data);
+		ret = data->ops->init(data);
 		if (ret < 0)
 			goto out;
 	}
@@ -1969,7 +1959,7 @@ void event_enable_unregister_trigger(char *glob,
 	}
 
 	if (unregistered && data->ops->free)
-		data->ops->free(data->ops, data);
+		data->ops->free(data);
 }
 
 static struct event_trigger_ops *
-- 
cgit v1.2.3


From e1f187d09e11f50a50cbb02ae277d4e8bdfc7db8 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 4 Feb 2022 16:12:06 -0600
Subject: tracing: Have existing event_command.parse() implementations use
 helpers

Simplify the existing event_command.parse() implementations by having
them make use of the helper functions previously introduced.

Link: https://lkml.kernel.org/r/b353e3427a81f9d3adafd98fd7d73e78a8209f43.1644010576.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.h                |   3 +-
 kernel/trace/trace_eprobe.c         |   3 +-
 kernel/trace/trace_events_hist.c    |  64 ++++++---------
 kernel/trace/trace_events_trigger.c | 150 ++++++++++--------------------------
 4 files changed, 69 insertions(+), 151 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f3e4cd1bb60e..ff816fb41e48 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1577,7 +1577,8 @@ extern int event_enable_trigger_print(struct seq_file *m,
 extern void event_enable_trigger_free(struct event_trigger_data *data);
 extern int event_enable_trigger_parse(struct event_command *cmd_ops,
 				      struct trace_event_file *file,
-				      char *glob, char *cmd, char *param);
+				      char *glob, char *cmd,
+				      char *param_and_filter);
 extern int event_enable_register_trigger(char *glob,
 					 struct event_trigger_data *data,
 					 struct trace_event_file *file);
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 9b3dfc9da067..b045fa9f276c 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -546,7 +546,8 @@ static struct event_trigger_ops eprobe_trigger_ops = {
 
 static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops,
 				    struct trace_event_file *file,
-				    char *glob, char *cmd, char *param)
+				    char *glob, char *cmd,
+				    char *param_and_filter)
 {
 	return -1;
 }
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 40a7e5582dfb..1b71bdb0bcb1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2785,7 +2785,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data,
 static struct event_command trigger_hist_cmd;
 static int event_hist_trigger_parse(struct event_command *cmd_ops,
 				    struct trace_event_file *file,
-				    char *glob, char *cmd, char *param);
+				    char *glob, char *cmd,
+				    char *param_and_filter);
 
 static bool compatible_keys(struct hist_trigger_data *target_hist_data,
 			    struct hist_trigger_data *hist_data,
@@ -6166,17 +6167,17 @@ static void hist_unreg_all(struct trace_event_file *file)
 
 static int event_hist_trigger_parse(struct event_command *cmd_ops,
 				    struct trace_event_file *file,
-				    char *glob, char *cmd, char *param)
+				    char *glob, char *cmd,
+				    char *param_and_filter)
 {
 	unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
 	struct event_trigger_data *trigger_data;
 	struct hist_trigger_attrs *attrs;
-	struct event_trigger_ops *trigger_ops;
 	struct hist_trigger_data *hist_data;
+	char *param, *filter, *p, *start;
 	struct synth_event *se;
 	const char *se_name;
-	bool remove = false;
-	char *trigger, *p, *start;
+	bool remove;
 	int ret = 0;
 
 	lockdep_assert_held(&event_mutex);
@@ -6185,31 +6186,30 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 
 	if (strlen(glob)) {
 		hist_err_clear();
-		last_cmd_set(file, param);
+		last_cmd_set(file, param_and_filter);
 	}
 
-	if (!param)
-		return -EINVAL;
+	remove = event_trigger_check_remove(glob);
 
-	if (glob[0] == '!')
-		remove = true;
+	if (event_trigger_empty_param(param_and_filter))
+		return -EINVAL;
 
 	/*
 	 * separate the trigger from the filter (k:v [if filter])
 	 * allowing for whitespace in the trigger
 	 */
-	p = trigger = param;
+	p = param = param_and_filter;
 	do {
 		p = strstr(p, "if");
 		if (!p)
 			break;
-		if (p == param)
+		if (p == param_and_filter)
 			return -EINVAL;
 		if (*(p - 1) != ' ' && *(p - 1) != '\t') {
 			p++;
 			continue;
 		}
-		if (p >= param + strlen(param) - (sizeof("if") - 1) - 1)
+		if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1)
 			return -EINVAL;
 		if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') {
 			p++;
@@ -6219,24 +6219,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 	} while (1);
 
 	if (!p)
-		param = NULL;
+		filter = NULL;
 	else {
 		*(p - 1) = '\0';
-		param = strstrip(p);
-		trigger = strstrip(trigger);
+		filter = strstrip(p);
+		param = strstrip(param);
 	}
 
 	/*
 	 * To simplify arithmetic expression parsing, replace occurrences of
 	 * '.sym-offset' modifier with '.symXoffset'
 	 */
-	start = strstr(trigger, ".sym-offset");
+	start = strstr(param, ".sym-offset");
 	while (start) {
 		*(start + 4) = 'X';
 		start = strstr(start + 11, ".sym-offset");
 	}
 
-	attrs = parse_hist_trigger_attrs(file->tr, trigger);
+	attrs = parse_hist_trigger_attrs(file->tr, param);
 	if (IS_ERR(attrs))
 		return PTR_ERR(attrs);
 
@@ -6249,29 +6249,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 		return PTR_ERR(hist_data);
 	}
 
-	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
-	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+	trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
 	if (!trigger_data) {
 		ret = -ENOMEM;
 		goto out_free;
 	}
 
-	trigger_data->count = -1;
-	trigger_data->ops = trigger_ops;
-	trigger_data->cmd_ops = cmd_ops;
-
-	INIT_LIST_HEAD(&trigger_data->list);
-	RCU_INIT_POINTER(trigger_data->filter, NULL);
-
-	trigger_data->private_data = hist_data;
-
-	/* if param is non-empty, it's supposed to be a filter */
-	if (param && cmd_ops->set_filter) {
-		ret = cmd_ops->set_filter(param, trigger_data, file);
-		if (ret < 0)
-			goto out_free;
-	}
+	ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
+	if (ret < 0)
+		goto out_free;
 
 	if (remove) {
 		if (!have_hist_trigger_match(trigger_data, file))
@@ -6298,8 +6284,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 		if (!(attrs->pause || attrs->cont || attrs->clear))
 			ret = -ENOENT;
 		goto out_free;
-	} else if (ret < 0)
-		goto out_free;
+	}
 
 	if (get_named_trigger_data(trigger_data))
 		goto enable;
@@ -6331,8 +6316,7 @@ enable:
  out_unreg:
 	event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
  out_free:
-	if (cmd_ops->set_filter)
-		cmd_ops->set_filter(NULL, trigger_data, NULL);
+	event_trigger_reset_filter(cmd_ops, trigger_data);
 
 	remove_hist_vars(hist_data);
 
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 32202175875d..62c44dab8f46 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -965,7 +965,7 @@ void event_trigger_unregister(struct event_command *cmd_ops,
  * @file: The trace_event_file associated with the event
  * @glob: The raw string used to register the trigger
  * @cmd: The cmd portion of the string used to register the trigger
- * @param: The params portion of the string used to register the trigger
+ * @param_and_filter: The param and filter portion of the string used to register the trigger
  *
  * Common implementation for event command parsing and trigger
  * instantiation.
@@ -978,72 +978,39 @@ void event_trigger_unregister(struct event_command *cmd_ops,
 static int
 event_trigger_parse(struct event_command *cmd_ops,
 		    struct trace_event_file *file,
-		    char *glob, char *cmd, char *param)
+		    char *glob, char *cmd, char *param_and_filter)
 {
 	struct event_trigger_data *trigger_data;
-	struct event_trigger_ops *trigger_ops;
-	char *trigger = NULL;
-	char *number;
+	char *param, *filter;
+	bool remove;
 	int ret;
 
-	/* separate the trigger from the filter (t:n [if filter]) */
-	if (param && isdigit(param[0])) {
-		trigger = strsep(&param, " \t");
-		if (param) {
-			param = skip_spaces(param);
-			if (!*param)
-				param = NULL;
-		}
-	}
+	remove = event_trigger_check_remove(glob);
 
-	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+	ret = event_trigger_separate_filter(param_and_filter, &param, &filter, false);
+	if (ret)
+		return ret;
 
 	ret = -ENOMEM;
-	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+	trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
 	if (!trigger_data)
 		goto out;
 
-	trigger_data->count = -1;
-	trigger_data->ops = trigger_ops;
-	trigger_data->cmd_ops = cmd_ops;
-	trigger_data->private_data = file;
-	INIT_LIST_HEAD(&trigger_data->list);
-	INIT_LIST_HEAD(&trigger_data->named_list);
-
-	if (glob[0] == '!') {
+	if (remove) {
 		event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
 		kfree(trigger_data);
 		ret = 0;
 		goto out;
 	}
 
-	if (trigger) {
-		number = strsep(&trigger, ":");
-
-		ret = -EINVAL;
-		if (!strlen(number))
-			goto out_free;
-
-		/*
-		 * We use the callback data field (which is a pointer)
-		 * as our counter.
-		 */
-		ret = kstrtoul(number, 0, &trigger_data->count);
-		if (ret)
-			goto out_free;
-	}
-
-	if (!param) /* if param is non-empty, it's supposed to be a filter */
-		goto out_reg;
-
-	if (!cmd_ops->set_filter)
-		goto out_reg;
+	ret = event_trigger_parse_num(param, trigger_data);
+	if (ret)
+		goto out_free;
 
-	ret = cmd_ops->set_filter(param, trigger_data, file);
+	ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
 	if (ret < 0)
 		goto out_free;
 
- out_reg:
 	/* Up the trigger_data count to make sure reg doesn't free it on failure */
 	event_trigger_init(trigger_data);
 
@@ -1057,8 +1024,7 @@ event_trigger_parse(struct event_command *cmd_ops,
 	return ret;
 
  out_free:
-	if (cmd_ops->set_filter)
-		cmd_ops->set_filter(NULL, trigger_data, NULL);
+	event_trigger_reset_filter(cmd_ops, trigger_data);
 	kfree(trigger_data);
 	goto out;
 }
@@ -1752,39 +1718,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
 
 int event_enable_trigger_parse(struct event_command *cmd_ops,
 			       struct trace_event_file *file,
-			       char *glob, char *cmd, char *param)
+			       char *glob, char *cmd, char *param_and_filter)
 {
 	struct trace_event_file *event_enable_file;
 	struct enable_trigger_data *enable_data;
 	struct event_trigger_data *trigger_data;
-	struct event_trigger_ops *trigger_ops;
 	struct trace_array *tr = file->tr;
+	char *param, *filter;
+	bool enable, remove;
 	const char *system;
 	const char *event;
 	bool hist = false;
-	char *trigger;
-	char *number;
-	bool enable;
 	int ret;
 
-	if (!param)
-		return -EINVAL;
+	remove = event_trigger_check_remove(glob);
 
-	/* separate the trigger from the filter (s:e:n [if filter]) */
-	trigger = strsep(&param, " \t");
-	if (!trigger)
+	if (event_trigger_empty_param(param_and_filter))
 		return -EINVAL;
-	if (param) {
-		param = skip_spaces(param);
-		if (!*param)
-			param = NULL;
-	}
 
-	system = strsep(&trigger, ":");
-	if (!trigger)
+	ret = event_trigger_separate_filter(param_and_filter, &param, &filter, true);
+	if (ret)
+		return ret;
+
+	system = strsep(&param, ":");
+	if (!param)
 		return -EINVAL;
 
-	event = strsep(&trigger, ":");
+	event = strsep(&param, ":");
 
 	ret = -EINVAL;
 	event_enable_file = find_event_file(tr, system, event);
@@ -1800,31 +1760,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 #else
 	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
 #endif
-	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
-
 	ret = -ENOMEM;
-	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
-	if (!trigger_data)
-		goto out;
 
 	enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
-	if (!enable_data) {
-		kfree(trigger_data);
+	if (!enable_data)
 		goto out;
-	}
-
-	trigger_data->count = -1;
-	trigger_data->ops = trigger_ops;
-	trigger_data->cmd_ops = cmd_ops;
-	INIT_LIST_HEAD(&trigger_data->list);
-	RCU_INIT_POINTER(trigger_data->filter, NULL);
 
 	enable_data->hist = hist;
 	enable_data->enable = enable;
 	enable_data->file = event_enable_file;
-	trigger_data->private_data = enable_data;
 
-	if (glob[0] == '!') {
+	trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+	if (!trigger_data) {
+		kfree(enable_data);
+		goto out;
+	}
+
+	if (remove) {
 		event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
 		kfree(trigger_data);
 		kfree(enable_data);
@@ -1835,33 +1787,14 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 	/* Up the trigger_data count to make sure nothing frees it on failure */
 	event_trigger_init(trigger_data);
 
-	if (trigger) {
-		number = strsep(&trigger, ":");
-
-		ret = -EINVAL;
-		if (!strlen(number))
-			goto out_free;
-
-		/*
-		 * We use the callback data field (which is a pointer)
-		 * as our counter.
-		 */
-		ret = kstrtoul(number, 0, &trigger_data->count);
-		if (ret)
-			goto out_free;
-	}
-
-	if (!param) /* if param is non-empty, it's supposed to be a filter */
-		goto out_reg;
-
-	if (!cmd_ops->set_filter)
-		goto out_reg;
+	ret = event_trigger_parse_num(param, trigger_data);
+	if (ret)
+		goto out_free;
 
-	ret = cmd_ops->set_filter(param, trigger_data, file);
+	ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data);
 	if (ret < 0)
 		goto out_free;
 
- out_reg:
 	/* Don't let event modules unload while probe registered */
 	ret = trace_event_try_get_ref(event_enable_file->event_call);
 	if (!ret) {
@@ -1880,16 +1813,15 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
 	event_trigger_free(trigger_data);
  out:
 	return ret;
-
  out_disable:
 	trace_event_enable_disable(event_enable_file, 0, 1);
  out_put:
 	trace_event_put_ref(event_enable_file->event_call);
  out_free:
-	if (cmd_ops->set_filter)
-		cmd_ops->set_filter(NULL, trigger_data, NULL);
+	event_trigger_reset_filter(cmd_ops, trigger_data);
 	event_trigger_free(trigger_data);
 	kfree(enable_data);
+
 	goto out;
 }
 
-- 
cgit v1.2.3


From a7e6b7dcfb19988ad2968a1fafd29b600abbf133 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 4 Feb 2022 16:12:07 -0600
Subject: tracing: Separate hist state updates from hist registration

hist_register_trigger() handles both new hist registration as well as
existing hist registration through event_command.reg().

Adding a new function, existing_hist_update_only(), that checks and
updates existing histograms and exits after doing so allows the
confusing logic in event_hist_trigger_parse() to be simplified.

Link: https://lkml.kernel.org/r/211b2cd3e3d7e00f4f8ad45ef8b33063da6a7e05.1644010576.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 66 +++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1b71bdb0bcb1..998dfe2162fc 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5929,6 +5929,48 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 	return true;
 }
 
+static bool existing_hist_update_only(char *glob,
+				      struct event_trigger_data *data,
+				      struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct event_trigger_data *test, *named_data = NULL;
+	bool updated = false;
+
+	if (!hist_data->attrs->pause && !hist_data->attrs->cont &&
+	    !hist_data->attrs->clear)
+		goto out;
+
+	if (hist_data->attrs->name) {
+		named_data = find_named_trigger(hist_data->attrs->name);
+		if (named_data) {
+			if (!hist_trigger_match(data, named_data, named_data,
+						true))
+				goto out;
+		}
+	}
+
+	if (hist_data->attrs->name && !named_data)
+		goto out;
+
+	list_for_each_entry(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, test, named_data, false))
+				continue;
+			if (hist_data->attrs->pause)
+				test->paused = true;
+			else if (hist_data->attrs->cont)
+				test->paused = false;
+			else if (hist_data->attrs->clear)
+				hist_clear(test);
+			updated = true;
+			goto out;
+		}
+	}
+ out:
+	return updated;
+}
+
 static int hist_register_trigger(char *glob,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
@@ -5957,19 +5999,11 @@ static int hist_register_trigger(char *glob,
 
 	list_for_each_entry(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			if (!hist_trigger_match(data, test, named_data, false))
-				continue;
-			if (hist_data->attrs->pause)
-				test->paused = true;
-			else if (hist_data->attrs->cont)
-				test->paused = false;
-			else if (hist_data->attrs->clear)
-				hist_clear(test);
-			else {
+			if (hist_trigger_match(data, test, named_data, false)) {
 				hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
 				ret = -EEXIST;
+				goto out;
 			}
-			goto out;
 		}
 	}
  new:
@@ -6008,8 +6042,6 @@ static int hist_register_trigger(char *glob,
 
 	if (named_data)
 		destroy_hist_data(hist_data);
-
-	ret++;
  out:
 	return ret;
 }
@@ -6277,14 +6309,12 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 		goto out_free;
 	}
 
-	ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
-	if (ret)
+	if (existing_hist_update_only(glob, trigger_data, file))
 		goto out_free;
-	if (ret == 0) {
-		if (!(attrs->pause || attrs->cont || attrs->clear))
-			ret = -ENOENT;
+
+	ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+	if (ret < 0)
 		goto out_free;
-	}
 
 	if (get_named_trigger_data(trigger_data))
 		goto enable;
-- 
cgit v1.2.3


From cf2adec7479d875973fe10782f8bf20377628ef2 Mon Sep 17 00:00:00 2001
From: Oscar Shiang <oscar0225@livemail.tw>
Date: Thu, 17 Feb 2022 00:50:06 +0800
Subject: tracing: Fix inconsistent style of mini-HOWTO

Each description should start with a hyphen and a space. Insert
spaces to fix it.

Link: https://lkml.kernel.org/r/TYCP286MB19130AA4A9C6FC5A8793DED2A1359@TYCP286MB1913.JPNP286.PROD.OUTLOOK.COM

Signed-off-by: Oscar Shiang <oscar0225@livemail.tw>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f4de111fa18f..4f9c499368c4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5475,7 +5475,7 @@ static const char readme_msg[] =
 	"  error_log\t- error log for failed commands (that support it)\n"
 	"  buffer_size_kb\t- view and modify size of per cpu buffer\n"
 	"  buffer_total_size_kb  - view total size of all cpu buffers\n\n"
-	"  trace_clock\t\t-change the clock used to order events\n"
+	"  trace_clock\t\t- change the clock used to order events\n"
 	"       local:   Per cpu clock but may not be synced across CPUs\n"
 	"      global:   Synced across CPUs but slows tracing down.\n"
 	"     counter:   Not a clock, but just an increment\n"
@@ -5484,7 +5484,7 @@ static const char readme_msg[] =
 #ifdef CONFIG_X86_64
 	"     x86-tsc:   TSC cycle counter\n"
 #endif
-	"\n  timestamp_mode\t-view the mode used to timestamp events\n"
+	"\n  timestamp_mode\t- view the mode used to timestamp events\n"
 	"       delta:   Delta difference against a buffer-wide timestamp\n"
 	"    absolute:   Absolute (standalone) timestamp\n"
 	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
-- 
cgit v1.2.3


From 3b57d8477cd0f427198930706958f1312cc3594b Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Fri, 18 Feb 2022 18:08:49 +0800
Subject: tracing: Fix kernel-doc

Fix the following W=1 kernel warnings:

kernel/trace/trace.c:1181: warning: expecting prototype for
tracing_snapshot_cond_data(). Prototype was for
tracing_cond_snapshot_data() instead.

Link: https://lkml.kernel.org/r/20220218100849.122038-1-jiapeng.chong@linux.alibaba.com

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f9c499368c4..aceeeea21c11 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1174,7 +1174,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
 EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
 
 /**
- * tracing_snapshot_cond_data - get the user data associated with a snapshot
+ * tracing_cond_snapshot_data - get the user data associated with a snapshot
  * @tr:		The tracing instance
  *
  * When the user enables a conditional snapshot using
-- 
cgit v1.2.3


From adaa0a9f06d127b6a0a1b929ec0971df40993a9b Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Sat, 2 Apr 2022 15:20:15 +0800
Subject: tracing: Fix tracing_map_sort_entries() kernel-doc comment

Add the description of @n_sort_keys and make @sort_key ->
@sort_keys in tracing_map_sort_entries() kernel-doc comment
to remove warnings found by running scripts/kernel-doc, which
is caused by using 'make W=1'.

kernel/trace/tracing_map.c:1073: warning: Function parameter or member
'sort_keys' not described in 'tracing_map_sort_entries'
kernel/trace/tracing_map.c:1073: warning: Function parameter or member
'n_sort_keys' not described in 'tracing_map_sort_entries'
kernel/trace/tracing_map.c:1073: warning: Excess function parameter
'sort_key' description in 'tracing_map_sort_entries'

Link: https://lkml.kernel.org/r/20220402072015.45864-1-yang.lee@linux.alibaba.com

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9628b5571846..9901708ce6b8 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -1045,7 +1045,8 @@ static void sort_secondary(struct tracing_map *map,
 /**
  * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
  * @map: The tracing_map
- * @sort_key: The sort key to use for sorting
+ * @sort_keys: The sort key to use for sorting
+ * @n_sort_keys: hitcount, always have at least one
  * @sort_entries: outval: pointer to allocated and sorted array of entries
  *
  * tracing_map_sort_entries() sorts the current set of entries in the
-- 
cgit v1.2.3


From cb1c45fb68b8a4285ccf750842b1136f26cfe267 Mon Sep 17 00:00:00 2001
From: Jeff Xie <xiehuan09@gmail.com>
Date: Sun, 10 Apr 2022 22:50:25 +0800
Subject: tracing: Make tp_printk work on syscall tracepoints

Currently the tp_printk option has no effect on syscall tracepoint.
When adding the kernel option parameter tp_printk, then:

echo 1 > /sys/kernel/debug/tracing/events/syscalls/enable

When running any application, no trace information is printed on the
terminal.

Now added printk for syscall tracepoints.

Link: https://lkml.kernel.org/r/20220410145025.681144-1-xiehuan09@gmail.com

Signed-off-by: Jeff Xie <xiehuan09@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 35 +++++++++++------------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f755bde42fd0..b69e207012c9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 			goto end;
 
 		/* parameter types */
-		if (tr->trace_flags & TRACE_ITER_VERBOSE)
+		if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
 			trace_seq_printf(s, "%s ", entry->types[i]);
 
 		/* parameter values */
@@ -296,9 +296,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	struct trace_event_file *trace_file;
 	struct syscall_trace_enter *entry;
 	struct syscall_metadata *sys_data;
-	struct ring_buffer_event *event;
-	struct trace_buffer *buffer;
-	unsigned int trace_ctx;
+	struct trace_event_buffer fbuffer;
 	unsigned long args[6];
 	int syscall_nr;
 	int size;
@@ -321,20 +319,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-	trace_ctx = tracing_gen_ctx();
-
-	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
-			sys_data->enter_event->event.type, size, trace_ctx);
-	if (!event)
+	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
+	if (!entry)
 		return;
 
-	entry = ring_buffer_event_data(event);
+	entry = ring_buffer_event_data(fbuffer.event);
 	entry->nr = syscall_nr;
 	syscall_get_arguments(current, regs, args);
 	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
-	event_trigger_unlock_commit(trace_file, buffer, event, entry,
-				    trace_ctx);
+	trace_event_buffer_commit(&fbuffer);
 }
 
 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,9 +337,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	struct trace_event_file *trace_file;
 	struct syscall_trace_exit *entry;
 	struct syscall_metadata *sys_data;
-	struct ring_buffer_event *event;
-	struct trace_buffer *buffer;
-	unsigned int trace_ctx;
+	struct trace_event_buffer fbuffer;
 	int syscall_nr;
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
@@ -364,20 +356,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	trace_ctx = tracing_gen_ctx();
-
-	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
-			sys_data->exit_event->event.type, sizeof(*entry),
-			trace_ctx);
-	if (!event)
+	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
+	if (!entry)
 		return;
 
-	entry = ring_buffer_event_data(event);
+	entry = ring_buffer_event_data(fbuffer.event);
 	entry->nr = syscall_nr;
 	entry->ret = syscall_get_return_value(current, regs);
 
-	event_trigger_unlock_commit(trace_file, buffer, event, entry,
-				    trace_ctx);
+	trace_event_buffer_commit(&fbuffer);
 }
 
 static int reg_event_syscall_enter(struct trace_event_file *file,
-- 
cgit v1.2.3


From 97a5d2e5e35f119b6d7a37c7538773ae7af82dd5 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Date: Mon, 18 Apr 2022 01:56:29 +0700
Subject: tracing: Return -EINVAL if WARN_ON(!glob) triggered in
 event_hist_trigger_parse()

If `WARN_ON(!glob)` is ever triggered, we will still continue executing
the next lines. This will trigger the more serious problem, a NULL
pointer dereference bug.

Just return -EINVAL if @glob is NULL.

Link: https://lkml.kernel.org/r/20220417185630.199062-2-ammarfaizi2@gnuweeb.org

Cc: Ingo Molnar <mingo@redhat.com>
Cc: GNU/Weeb Mailing List <gwml@vger.gnuweeb.org>
Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 998dfe2162fc..80c25be23c45 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6214,7 +6214,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 
 	lockdep_assert_held(&event_mutex);
 
-	WARN_ON(!glob);
+	if (WARN_ON(!glob))
+		return -EINVAL;
 
 	if (strlen(glob)) {
 		hist_err_clear();
-- 
cgit v1.2.3


From 69686fcbdcc0b6fed3b8d0907e641f282df2f827 Mon Sep 17 00:00:00 2001
From: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Date: Mon, 18 Apr 2022 01:56:30 +0700
Subject: tracing: Change `if (strlen(glob))` to `if (glob[0])`

No need to traverse to the end of string. If the first byte is not a NUL
char, it's guaranteed `if (strlen(glob))` is true.

Link: https://lkml.kernel.org/r/20220417185630.199062-3-ammarfaizi2@gnuweeb.org

Cc: Ingo Molnar <mingo@redhat.com>
Cc: GNU/Weeb Mailing List <gwml@vger.gnuweeb.org>
Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 80c25be23c45..fe10179893c1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6217,7 +6217,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
 	if (WARN_ON(!glob))
 		return -EINVAL;
 
-	if (strlen(glob)) {
+	if (glob[0]) {
 		hist_err_clear();
 		last_cmd_set(file, param_and_filter);
 	}
-- 
cgit v1.2.3


From 12025abdc8539ed9d5014e2d647a3fd1bd3de5cd Mon Sep 17 00:00:00 2001
From: Jun Miao <jun.miao@intel.com>
Date: Tue, 19 Apr 2022 09:39:10 +0800
Subject: tracing: Fix sleeping function called from invalid context on RT
 kernel

When setting bootparams="trace_event=initcall:initcall_start tp_printk=1" in the
cmdline, the output_printk() was called, and the spin_lock_irqsave() was called in the
atomic and irq disable interrupt context suitation. On the PREEMPT_RT kernel,
these locks are replaced with sleepable rt-spinlock, so the stack calltrace will
be triggered.
Fix it by raw_spin_lock_irqsave when PREEMPT_RT and "trace_event=initcall:initcall_start
tp_printk=1" enabled.

 BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:46
 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
 preempt_count: 2, expected: 0
 RCU nest depth: 0, expected: 0
 Preemption disabled at:
 [<ffffffff8992303e>] try_to_wake_up+0x7e/0xba0
 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.1-rt17+ #19 34c5812404187a875f32bee7977f7367f9679ea7
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x60/0x8c
  dump_stack+0x10/0x12
  __might_resched.cold+0x11d/0x155
  rt_spin_lock+0x40/0x70
  trace_event_buffer_commit+0x2fa/0x4c0
  ? map_vsyscall+0x93/0x93
  trace_event_raw_event_initcall_start+0xbe/0x110
  ? perf_trace_initcall_finish+0x210/0x210
  ? probe_sched_wakeup+0x34/0x40
  ? ttwu_do_wakeup+0xda/0x310
  ? trace_hardirqs_on+0x35/0x170
  ? map_vsyscall+0x93/0x93
  do_one_initcall+0x217/0x3c0
  ? trace_event_raw_event_initcall_level+0x170/0x170
  ? push_cpu_stop+0x400/0x400
  ? cblist_init_generic+0x241/0x290
  kernel_init_freeable+0x1ac/0x347
  ? _raw_spin_unlock_irq+0x65/0x80
  ? rest_init+0xf0/0xf0
  kernel_init+0x1e/0x150
  ret_from_fork+0x22/0x30
  </TASK>

Link: https://lkml.kernel.org/r/20220419013910.894370-1-jun.miao@intel.com

Signed-off-by: Jun Miao <jun.miao@intel.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aceeeea21c11..27bb486c3f97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2835,7 +2835,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
 
-static DEFINE_SPINLOCK(tracepoint_iter_lock);
+static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock);
 static DEFINE_MUTEX(tracepoint_printk_mutex);
 
 static void output_printk(struct trace_event_buffer *fbuffer)
@@ -2863,14 +2863,14 @@ static void output_printk(struct trace_event_buffer *fbuffer)
 
 	event = &fbuffer->trace_file->event_call->event;
 
-	spin_lock_irqsave(&tracepoint_iter_lock, flags);
+	raw_spin_lock_irqsave(&tracepoint_iter_lock, flags);
 	trace_seq_init(&iter->seq);
 	iter->ent = fbuffer->entry;
 	event_call->event.funcs->trace(iter, 0, event);
 	trace_seq_putc(&iter->seq, 0);
 	printk("%s", iter->seq.buffer);
 
-	spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+	raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
 }
 
 int tracepoint_printk_sysctl(struct ctl_table *table, int write,
-- 
cgit v1.2.3


From 4ee51101e93f0c8d83876ae5c0c26ca14934629e Mon Sep 17 00:00:00 2001
From: Guo Zhengkui <guozhengkui@vivo.com>
Date: Sun, 24 Apr 2022 21:19:32 +0800
Subject: tracing: Use WARN instead of printk and WARN_ON

Use `WARN(cond, ...)` instead of `if (cond)` + `printk(...)` +
`WARN_ON(1)`.

Link: https://lkml.kernel.org/r/20220424131932.3606-1-guozhengkui@vivo.com

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Guo Zhengkui <guozhengkui@vivo.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8aa493d25c73..d89e3f7e26eb 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -778,9 +778,8 @@ int register_trace_event(struct trace_event *event)
 
 		list_add_tail(&event->list, list);
 
-	} else if (event->type > __TRACE_LAST_TYPE) {
-		printk(KERN_WARNING "Need to add type to trace.h\n");
-		WARN_ON(1);
+	} else if (WARN(event->type > __TRACE_LAST_TYPE,
+			"Need to add type to trace.h")) {
 		goto out;
 	} else {
 		/* Is this event already used */
@@ -1571,13 +1570,8 @@ __init static int init_events(void)
 
 	for (i = 0; events[i]; i++) {
 		event = events[i];
-
 		ret = register_trace_event(event);
-		if (!ret) {
-			printk(KERN_WARNING "event %d failed to register\n",
-			       event->type);
-			WARN_ON_ONCE(1);
-		}
+		WARN_ONCE(!ret, "event %d failed to register", event->type);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From ed888241a0ab9a3606bd986be6f0abafa1afdf19 Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Tue, 26 Apr 2022 15:06:28 +0800
Subject: ring-buffer: Simplify if-if to if-else

Use if and else instead of if(A) and if (!A).

Link: https://lkml.kernel.org/r/20220426070628.167565-1-wanjiabing@vivo.com

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05dfc7a12d3d..655d6db3e3c3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6011,10 +6011,10 @@ static __init int test_ringbuffer(void)
 		pr_info("        total events:   %ld\n", total_lost + total_read);
 		pr_info("  recorded len bytes:   %ld\n", total_len);
 		pr_info(" recorded size bytes:   %ld\n", total_size);
-		if (total_lost)
+		if (total_lost) {
 			pr_info(" With dropped events, record len and size may not match\n"
 				" alloced and written from above\n");
-		if (!total_lost) {
+		} else {
 			if (RB_WARN_ON(buffer, total_len != total_alloc ||
 				       total_size != total_written))
 				break;
-- 
cgit v1.2.3


From ef9188bcc6ca1d8a2ad83e826b548e6820721061 Mon Sep 17 00:00:00 2001
From: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Date: Tue, 26 Apr 2022 20:24:06 +0800
Subject: tracing: Avoid adding tracer option before update_tracer_options

To prepare for support asynchronous tracer_init_tracefs initcall,
avoid calling create_trace_option_files before __update_tracer_options.
Otherwise, create_trace_option_files will show warning because
some tracers in trace_types list are already in tr->topts.

For example, hwlat_tracer call register_tracer in late_initcall,
and global_trace.dir is already created in tracing_init_dentry,
hwlat_tracer will be put into tr->topts.
Then if the __update_tracer_options is executed after hwlat_tracer
registered, create_trace_option_files find that hwlat_tracer is
already in tr->topts.

Link: https://lkml.kernel.org/r/20220426122407.17042-2-mark-pk.tsai@mediatek.com

Link: https://lore.kernel.org/lkml/20220322133339.GA32582@xsang-OptiPlex-9020/
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 27bb486c3f97..7275173c55d0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6332,12 +6332,18 @@ static void tracing_set_nop(struct trace_array *tr)
 	tr->current_trace = &nop_trace;
 }
 
+static bool tracer_options_updated;
+
 static void add_tracer_options(struct trace_array *tr, struct tracer *t)
 {
 	/* Only enable if the directory has been created already. */
 	if (!tr->dir)
 		return;
 
+	/* Only create trace option files after update_tracer_options finish */
+	if (!tracer_options_updated)
+		return;
+
 	create_trace_option_files(tr, t);
 }
 
@@ -9176,6 +9182,7 @@ static void __update_tracer_options(struct trace_array *tr)
 static void update_tracer_options(struct trace_array *tr)
 {
 	mutex_lock(&trace_types_lock);
+	tracer_options_updated = true;
 	__update_tracer_options(tr);
 	mutex_unlock(&trace_types_lock);
 }
-- 
cgit v1.2.3


From 6621a7004684bfcff5af4c8e4d37989941f42a6b Mon Sep 17 00:00:00 2001
From: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Date: Tue, 26 Apr 2022 20:24:07 +0800
Subject: tracing: make tracer_init_tracefs initcall asynchronous

Move trace_eval_init() to subsys_initcall to make it start
earlier.
And to avoid tracer_init_tracefs being blocked by
trace_event_sem which trace_eval_init() hold [1],
queue tracer_init_tracefs() to eval_map_wq to let
the two works being executed sequentially.

It can speed up the initialization of kernel as result
of making tracer_init_tracefs asynchronous.

On my arm64 platform, it reduce ~20ms of 125ms which total
time do_initcalls spend.

Link: https://lkml.kernel.org/r/20220426122407.17042-3-mark-pk.tsai@mediatek.com

[1]: https://lore.kernel.org/r/68d7b3327052757d0cd6359a6c9015a85b437232.camel@pengutronix.de
Signed-off-by: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7275173c55d0..400d3e9fe9ff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9615,6 +9615,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[];
 
 static struct workqueue_struct *eval_map_wq __initdata;
 static struct work_struct eval_map_work __initdata;
+static struct work_struct tracerfs_init_work __initdata;
 
 static void __init eval_map_work_func(struct work_struct *work)
 {
@@ -9640,6 +9641,8 @@ static int __init trace_eval_init(void)
 	return 0;
 }
 
+subsys_initcall(trace_eval_init);
+
 static int __init trace_eval_sync(void)
 {
 	/* Make sure the eval map updates are finished */
@@ -9722,15 +9725,8 @@ static struct notifier_block trace_module_nb = {
 };
 #endif /* CONFIG_MODULES */
 
-static __init int tracer_init_tracefs(void)
+static __init void tracer_init_tracefs_work_func(struct work_struct *work)
 {
-	int ret;
-
-	trace_access_lock_init();
-
-	ret = tracing_init_dentry();
-	if (ret)
-		return 0;
 
 	event_trace_init();
 
@@ -9752,8 +9748,6 @@ static __init int tracer_init_tracefs(void)
 	trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
 			NULL, &tracing_saved_tgids_fops);
 
-	trace_eval_init();
-
 	trace_create_eval_file(NULL);
 
 #ifdef CONFIG_MODULES
@@ -9768,6 +9762,24 @@ static __init int tracer_init_tracefs(void)
 	create_trace_instances(NULL);
 
 	update_tracer_options(&global_trace);
+}
+
+static __init int tracer_init_tracefs(void)
+{
+	int ret;
+
+	trace_access_lock_init();
+
+	ret = tracing_init_dentry();
+	if (ret)
+		return 0;
+
+	if (eval_map_wq) {
+		INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
+		queue_work(eval_map_wq, &tracerfs_init_work);
+	} else {
+		tracer_init_tracefs_work_func(NULL);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From b041b525dab95352fbd666b14dc73ab898df465f Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 10 Mar 2022 12:48:53 -0800
Subject: x86/split_lock: Make life miserable for split lockers

In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
said:

  Its's simply wishful thinking that stuff gets fixed because of a
  WARN_ONCE(). This has never worked. The only thing which works is to
  make stuff fail hard or slow it down in a way which makes it annoying
  enough to users to complain.

He was talking about WBINVD. But it made me think about how we use the
split lock detection feature in Linux.

Existing code has three options for applications:

 1) Don't enable split lock detection (allow arbitrary split locks)
 2) Warn once when a process uses split lock, but let the process
    keep running with split lock detection disabled
 3) Kill process that use split locks

Option 2 falls into the "wishful thinking" territory that Thomas warns does
nothing. But option 3 might not be viable in a situation with legacy
applications that need to run.

Hence make option 2 much stricter to "slow it down in a way which makes
it annoying".

Primary reason for this change is to provide better quality of service to
the rest of the applications running on the system. Internal testing shows
that even with many processes splitting locks, performance for the rest of
the system is much more responsive.

The new "warn" mode operates like this.  When an application tries to
execute a bus lock the #AC handler.

 1) Delays (interruptibly) 10 ms before moving to next step.

 2) Blocks (interruptibly) until it can get the semaphore
	If interrupted, just return. Assume the signal will either
	kill the task, or direct execution away from the instruction
	that is trying to get the bus lock.
 3) Disables split lock detection for the current core
 4) Schedules a work queue to re-enable split lock detect in 2 jiffies
 5) Returns

The work queue that re-enables split lock detection also releases the
semaphore.

There is a corner case where a CPU may be taken offline while split lock
detection is disabled. A CPU hotplug handler handles this case.

Old behaviour was to only print the split lock warning on the first
occurrence of a split lock from a task. Preserve that by adding a flag to
the task structure that suppresses subsequent split lock messages from that
task.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220310204854.31752-2-tony.luck@intel.com
---
 kernel/fork.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..f39795f372c2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1045,6 +1045,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
 #endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+	tsk->reported_split_lock = 0;
+#endif
+
 	return tsk;
 
 free_stack:
-- 
cgit v1.2.3


From 6695da58f9445f386516ed53a3e870f534d2645d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2022 15:33:39 -0400
Subject: ring-buffer: Have absolute time stamps handle large numbers

There's an absolute timestamp event in the ring buffer, but this only
saves 59 bits of the timestamp, as the 5 MSB is used for meta data
(stating it is an absolute time stamp). This was never an issue as all the
clocks currently in use never used those 5 MSB. But now there's a new
clock (TAI) that does.

To handle this case, when reading an absolute timestamp, a previous full
timestamp is passed in, and the 5 MSB of that timestamp is OR'd to the
absolute timestamp (if any of the 5 MSB are set), and then to test for
overflow, if the new result is smaller than the passed in previous
timestamp, then 1 << 59 is added to it.

All the extra processing is done on the reader "slow" path, with the
exception of the "too big delta" check, and the reading of timestamps
for histograms.

Note, libtraceevent will need to be updated to handle this case as well.
But this is not a user space regression, as user space was never able to
handle any timestamps that used more than 59 bits.

Link: https://lore.kernel.org/all/20220426175338.3807ca4f@gandalf.local.home/
Link: https://lkml.kernel.org/r/20220427153339.16c33f75@gandalf.local.home

Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 49 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 655d6db3e3c3..3a0c7ed0e93f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -29,6 +29,14 @@
 
 #include <asm/local.h>
 
+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB		(0xf8ULL << 56)
+#define ABS_TS_MASK	(~TS_MSB)
+
 static void update_pages_handler(struct work_struct *work);
 
 /*
@@ -783,6 +791,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
 }
 #endif
 
+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The rb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+	if (save_ts & TS_MSB) {
+		abs |= save_ts & TS_MSB;
+		/* Check for overflow */
+		if (unlikely(abs < save_ts))
+			abs += 1ULL << 59;
+	}
+	return abs;
+}
 
 static inline u64 rb_time_stamp(struct trace_buffer *buffer);
 
@@ -811,8 +837,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
 	u64 ts;
 
 	/* If the event includes an absolute time, then just use that */
-	if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
-		return rb_event_time_stamp(event);
+	if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+		ts = rb_event_time_stamp(event);
+		return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
+	}
 
 	nest = local_read(&cpu_buffer->committing);
 	verify_event(cpu_buffer, event);
@@ -2754,8 +2782,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
 
 	if (unlikely(info->delta > (1ULL << 59))) {
+		/*
+		 * Some timers can use more than 59 bits, and when a timestamp
+		 * is added to the buffer, it will lose those bits.
+		 */
+		if (abs && (info->ts & TS_MSB)) {
+			info->delta &= ABS_TS_MASK;
+
 		/* did the clock go backwards */
-		if (info->before == info->after && info->before > info->ts) {
+		} else if (info->before == info->after && info->before > info->ts) {
 			/* not interrupted */
 			static int once;
 
@@ -3304,7 +3339,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
 
 		case RINGBUF_TYPE_TIME_STAMP:
 			delta = rb_event_time_stamp(event);
-			ts = delta;
+			ts = rb_fix_abs_ts(delta, ts);
 			pr_warn("  [%lld] absolute:%lld TIME STAMP\n", ts, delta);
 			break;
 
@@ -3380,7 +3415,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 
 		case RINGBUF_TYPE_TIME_STAMP:
 			delta = rb_event_time_stamp(event);
-			ts = delta;
+			ts = rb_fix_abs_ts(delta, ts);
 			break;
 
 		case RINGBUF_TYPE_PADDING:
@@ -4367,6 +4402,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 
 	case RINGBUF_TYPE_TIME_STAMP:
 		delta = rb_event_time_stamp(event);
+		delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
 		cpu_buffer->read_stamp = delta;
 		return;
 
@@ -4397,6 +4433,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 
 	case RINGBUF_TYPE_TIME_STAMP:
 		delta = rb_event_time_stamp(event);
+		delta = rb_fix_abs_ts(delta, iter->read_stamp);
 		iter->read_stamp = delta;
 		return;
 
@@ -4650,6 +4687,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 	case RINGBUF_TYPE_TIME_STAMP:
 		if (ts) {
 			*ts = rb_event_time_stamp(event);
+			*ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
 			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
 		}
@@ -4741,6 +4779,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	case RINGBUF_TYPE_TIME_STAMP:
 		if (ts) {
 			*ts = rb_event_time_stamp(event);
+			*ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
 			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
 		}
-- 
cgit v1.2.3


From f03f2abce4f3944b9576bc4ad28b75890e907d7c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2022 17:08:12 -0400
Subject: ring-buffer: Have 32 bit time stamps use all 64 bits

When the new logic was made to handle deltas of events from interrupts
that interrupted other events, it required 64 bit local atomics.
Unfortunately, 64 bit local atomics are expensive on 32 bit architectures.
Thus, commit 10464b4aa605e ("ring-buffer: Add rb_time_t 64 bit operations
for speeding up 32 bit") created a type of seq lock timer for 32 bits.
It used two 32 bit local atomics, but required 2 bits from them each for
synchronization, making it only 60 bits.

Add a new "msb" field to hold the extra 4 bits that are cut off.

Link: https://lore.kernel.org/all/20220426175338.3807ca4f@gandalf.local.home/
Link: https://lkml.kernel.org/r/20220427170812.53cc7139@gandalf.local.home

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3a0c7ed0e93f..d59b6a328b7f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -476,6 +476,7 @@ struct rb_time_struct {
 	local_t		cnt;
 	local_t		top;
 	local_t		bottom;
+	local_t		msb;
 };
 #else
 #include <asm/local64.h>
@@ -577,7 +578,6 @@ struct ring_buffer_iter {
  * For the ring buffer, 64 bit required operations for the time is
  * the following:
  *
- *  - Only need 59 bits (uses 60 to make it even).
  *  - Reads may fail if it interrupted a modification of the time stamp.
  *      It will succeed if it did not interrupt another write even if
  *      the read itself is interrupted by a write.
@@ -602,6 +602,7 @@ struct ring_buffer_iter {
  */
 #define RB_TIME_SHIFT	30
 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+#define RB_TIME_MSB_SHIFT	 60
 
 static inline int rb_time_cnt(unsigned long val)
 {
@@ -621,7 +622,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
 
 static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 {
-	unsigned long top, bottom;
+	unsigned long top, bottom, msb;
 	unsigned long c;
 
 	/*
@@ -633,6 +634,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 		c = local_read(&t->cnt);
 		top = local_read(&t->top);
 		bottom = local_read(&t->bottom);
+		msb = local_read(&t->msb);
 	} while (c != local_read(&t->cnt));
 
 	*cnt = rb_time_cnt(top);
@@ -641,7 +643,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 	if (*cnt != rb_time_cnt(bottom))
 		return false;
 
-	*ret = rb_time_val(top, bottom);
+	/* The shift to msb will lose its cnt bits */
+	*ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
 	return true;
 }
 
@@ -657,10 +660,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt
 	return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
 }
 
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
+				 unsigned long *msb)
 {
 	*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
 	*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+	*msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
 }
 
 static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
@@ -671,15 +676,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long
 
 static void rb_time_set(rb_time_t *t, u64 val)
 {
-	unsigned long cnt, top, bottom;
+	unsigned long cnt, top, bottom, msb;
 
-	rb_time_split(val, &top, &bottom);
+	rb_time_split(val, &top, &bottom, &msb);
 
 	/* Writes always succeed with a valid number even if it gets interrupted. */
 	do {
 		cnt = local_inc_return(&t->cnt);
 		rb_time_val_set(&t->top, top, cnt);
 		rb_time_val_set(&t->bottom, bottom, cnt);
+		rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
 	} while (cnt != local_read(&t->cnt));
 }
 
@@ -694,8 +700,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
 
 static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 {
-	unsigned long cnt, top, bottom;
-	unsigned long cnt2, top2, bottom2;
+	unsigned long cnt, top, bottom, msb;
+	unsigned long cnt2, top2, bottom2, msb2;
 	u64 val;
 
 	/* The cmpxchg always fails if it interrupted an update */
@@ -711,16 +717,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 
 	 cnt2 = cnt + 1;
 
-	 rb_time_split(val, &top, &bottom);
+	 rb_time_split(val, &top, &bottom, &msb);
 	 top = rb_time_val_cnt(top, cnt);
 	 bottom = rb_time_val_cnt(bottom, cnt);
 
-	 rb_time_split(set, &top2, &bottom2);
+	 rb_time_split(set, &top2, &bottom2, &msb2);
 	 top2 = rb_time_val_cnt(top2, cnt2);
 	 bottom2 = rb_time_val_cnt(bottom2, cnt2);
 
 	if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
 		return false;
+	if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+		return false;
 	if (!rb_time_read_cmpxchg(&t->top, top, top2))
 		return false;
 	if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
-- 
cgit v1.2.3


From c575afe21ccc47c5561d11493ede81ab7d072267 Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@linutronix.de>
Date: Thu, 14 Apr 2022 11:18:04 +0200
Subject: tracing: Introduce trace clock tai

A fast/NMI safe accessor for CLOCK_TAI has been introduced.
Use it for adding the additional trace clock "tai".

Link: https://lkml.kernel.org/r/20220414091805.89667-3-kurt@linutronix.de

Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 400d3e9fe9ff..087740f63d92 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1542,6 +1542,7 @@ static struct {
 	{ ktime_get_mono_fast_ns,	"mono",		1 },
 	{ ktime_get_raw_fast_ns,	"mono_raw",	1 },
 	{ ktime_get_boot_fast_ns,	"boot",		1 },
+	{ ktime_get_tai_fast_ns,	"tai",		1 },
 	ARCH_TRACE_CLOCKS
 };
 
-- 
cgit v1.2.3


From 1da27a25054f7660f7bb27ad4ccf036ee6ba51e9 Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Wed, 27 Apr 2022 19:07:31 +0200
Subject: tracing: Remove usage of list iterator after the loop body

In preparation to limit the scope of the list iterator variable to the
traversal loop, use a dedicated pointer to point to the found element
[1].

Before, the code implicitly used the head when no element was found
when using &pos->list. Since the new variable is only set if an
element was found, the head needs to be used explicitly if the
variable is NULL.

Link: https://lkml.kernel.org/r/20220427170734.819891-2-jakobkoschel@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1]
Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d89e3f7e26eb..67f47ea27921 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list);
 
 static int trace_search_list(struct list_head **list)
 {
-	struct trace_event *e;
+	struct trace_event *e = NULL, *iter;
 	int next = __TRACE_LAST_TYPE;
 
 	if (list_empty(&ftrace_event_list)) {
@@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list)
 	 * We used up all possible max events,
 	 * lets see if somebody freed one.
 	 */
-	list_for_each_entry(e, &ftrace_event_list, list) {
-		if (e->type != next)
+	list_for_each_entry(iter, &ftrace_event_list, list) {
+		if (iter->type != next) {
+			e = iter;
 			break;
+		}
 		next++;
 	}
 
@@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list)
 	if (next > TRACE_EVENT_TYPE_MAX)
 		return 0;
 
-	*list = &e->list;
+	if (e)
+		*list = &e->list;
+	else
+		*list = &ftrace_event_list;
 	return next;
 }
 
-- 
cgit v1.2.3


From 99d8ae4ec8a90ecf6c2a6a141bd4db4a3bc5146c Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Wed, 27 Apr 2022 19:07:32 +0200
Subject: tracing: Remove usage of list iterator variable after the loop

In preparation to limit the scope of a list iterator to the list
traversal loop, use a dedicated pointer to point to the found element
[1].

Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1]
Link: https://lkml.kernel.org/r/20220427170734.819891-3-jakobkoschel@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e11e167b7809..e4a442060707 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1723,9 +1723,9 @@ static LIST_HEAD(event_subsystems);
 
 static int subsystem_open(struct inode *inode, struct file *filp)
 {
+	struct trace_subsystem_dir *dir = NULL, *iter_dir;
+	struct trace_array *tr = NULL, *iter_tr;
 	struct event_subsystem *system = NULL;
-	struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
-	struct trace_array *tr;
 	int ret;
 
 	if (tracing_is_disabled())
@@ -1734,10 +1734,12 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 	/* Make sure the system still exists */
 	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
-	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
-		list_for_each_entry(dir, &tr->systems, list) {
-			if (dir == inode->i_private) {
+	list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
+		list_for_each_entry(iter_dir, &iter_tr->systems, list) {
+			if (iter_dir == inode->i_private) {
 				/* Don't open systems with no events */
+				tr = iter_tr;
+				dir = iter_dir;
 				if (dir->nr_events) {
 					__get_system_dir(dir);
 					system = dir->subsystem;
@@ -1753,9 +1755,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 	if (!system)
 		return -ENODEV;
 
-	/* Some versions of gcc think dir can be uninitialized here */
-	WARN_ON(!dir);
-
 	/* Still need to increment the ref count of the system */
 	if (trace_array_get(tr) < 0) {
 		put_system(dir);
-- 
cgit v1.2.3


From 45e333ce2ad5cbb0ee05686336de09058c6af8ca Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Wed, 27 Apr 2022 19:07:33 +0200
Subject: tracing: Replace usage of found with dedicated list iterator variable

To move the list iterator variable into the list_for_each_entry_*()
macro in the future it should be avoided to use the list iterator
variable after the loop body.

To *never* use the list iterator variable after the loop it was
concluded to use a separate iterator variable instead of a
found boolean [1].

This removes the need to use a found variable and simply checking if
the variable was set, can determine if the break/goto was hit.

Link: https://lkml.kernel.org/r/20220427170734.819891-4-jakobkoschel@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1]
Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c    | 15 +++++++--------
 kernel/trace/trace_events_trigger.c | 24 +++++++++++-------------
 2 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fe10179893c1..038dc591545d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6117,20 +6117,19 @@ static void hist_unregister_trigger(char *glob,
 				    struct event_trigger_data *data,
 				    struct trace_event_file *file)
 {
+	struct event_trigger_data *test = NULL, *iter, *named_data = NULL;
 	struct hist_trigger_data *hist_data = data->private_data;
-	struct event_trigger_data *test, *named_data = NULL;
-	bool unregistered = false;
 
 	lockdep_assert_held(&event_mutex);
 
 	if (hist_data->attrs->name)
 		named_data = find_named_trigger(hist_data->attrs->name);
 
-	list_for_each_entry(test, &file->triggers, list) {
-		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			if (!hist_trigger_match(data, test, named_data, false))
+	list_for_each_entry(iter, &file->triggers, list) {
+		if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, iter, named_data, false))
 				continue;
-			unregistered = true;
+			test = iter;
 			list_del_rcu(&test->list);
 			trace_event_trigger_enable_disable(file, 0);
 			update_cond_flag(file);
@@ -6138,11 +6137,11 @@ static void hist_unregister_trigger(char *glob,
 		}
 	}
 
-	if (unregistered && test->ops->free)
+	if (test && test->ops->free)
 		test->ops->free(test);
 
 	if (hist_data->enable_timestamps) {
-		if (!hist_data->remove || unregistered)
+		if (!hist_data->remove || test)
 			tracing_set_filter_buffering(file->tr, false);
 	}
 }
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 62c44dab8f46..21592bed2e89 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -609,14 +609,13 @@ static void unregister_trigger(char *glob,
 			       struct event_trigger_data *test,
 			       struct trace_event_file *file)
 {
-	struct event_trigger_data *data;
-	bool unregistered = false;
+	struct event_trigger_data *data = NULL, *iter;
 
 	lockdep_assert_held(&event_mutex);
 
-	list_for_each_entry(data, &file->triggers, list) {
-		if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
-			unregistered = true;
+	list_for_each_entry(iter, &file->triggers, list) {
+		if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
+			data = iter;
 			list_del_rcu(&data->list);
 			trace_event_trigger_enable_disable(file, 0);
 			update_cond_flag(file);
@@ -624,7 +623,7 @@ static void unregister_trigger(char *glob,
 		}
 	}
 
-	if (unregistered && data->ops->free)
+	if (data && data->ops->free)
 		data->ops->free(data);
 }
 
@@ -1870,19 +1869,18 @@ void event_enable_unregister_trigger(char *glob,
 				     struct trace_event_file *file)
 {
 	struct enable_trigger_data *test_enable_data = test->private_data;
+	struct event_trigger_data *data = NULL, *iter;
 	struct enable_trigger_data *enable_data;
-	struct event_trigger_data *data;
-	bool unregistered = false;
 
 	lockdep_assert_held(&event_mutex);
 
-	list_for_each_entry(data, &file->triggers, list) {
-		enable_data = data->private_data;
+	list_for_each_entry(iter, &file->triggers, list) {
+		enable_data = iter->private_data;
 		if (enable_data &&
-		    (data->cmd_ops->trigger_type ==
+		    (iter->cmd_ops->trigger_type ==
 		     test->cmd_ops->trigger_type) &&
 		    (enable_data->file == test_enable_data->file)) {
-			unregistered = true;
+			data = iter;
 			list_del_rcu(&data->list);
 			trace_event_trigger_enable_disable(file, 0);
 			update_cond_flag(file);
@@ -1890,7 +1888,7 @@ void event_enable_unregister_trigger(char *glob,
 		}
 	}
 
-	if (unregistered && data->ops->free)
+	if (data && data->ops->free)
 		data->ops->free(data);
 }
 
-- 
cgit v1.2.3


From ba27d8555867b0e02e15709f4ddb79aec5cf2efc Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Wed, 27 Apr 2022 19:07:34 +0200
Subject: tracing: Remove check of list iterator against head past the loop
 body

When list_for_each_entry() completes the iteration over the whole list
without breaking the loop, the iterator value will be a bogus pointer
computed based on the head element.

While it is safe to use the pointer to determine if it was computed
based on the head element, either with list_entry_is_head() or
&pos->member == head, using the iterator variable after the loop should
be avoided.

In preparation to limit the scope of a list iterator to the list
traversal loop, use a dedicated pointer to point to the found element [1].

Link: https://lkml.kernel.org/r/20220427170734.819891-5-jakobkoschel@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/
Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c       | 20 ++++++++++++--------
 kernel/trace/trace_eprobe.c | 14 ++++++++------
 kernel/trace/trace_events.c | 12 ++++++------
 3 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f1d2f5e7263..5c465e70d146 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4560,8 +4560,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
 			       struct ftrace_probe_ops *probe_ops,
 			       void *data)
 {
+	struct ftrace_func_probe *probe = NULL, *iter;
 	struct ftrace_func_entry *entry;
-	struct ftrace_func_probe *probe;
 	struct ftrace_hash **orig_hash;
 	struct ftrace_hash *old_hash;
 	struct ftrace_hash *hash;
@@ -4580,11 +4580,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
 
 	mutex_lock(&ftrace_lock);
 	/* Check if the probe_ops is already registered */
-	list_for_each_entry(probe, &tr->func_probes, list) {
-		if (probe->probe_ops == probe_ops)
+	list_for_each_entry(iter, &tr->func_probes, list) {
+		if (iter->probe_ops == probe_ops) {
+			probe = iter;
 			break;
+		}
 	}
-	if (&probe->list == &tr->func_probes) {
+	if (!probe) {
 		probe = kzalloc(sizeof(*probe), GFP_KERNEL);
 		if (!probe) {
 			mutex_unlock(&ftrace_lock);
@@ -4702,9 +4704,9 @@ int
 unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
 				      struct ftrace_probe_ops *probe_ops)
 {
+	struct ftrace_func_probe *probe = NULL, *iter;
 	struct ftrace_ops_hash old_hash_ops;
 	struct ftrace_func_entry *entry;
-	struct ftrace_func_probe *probe;
 	struct ftrace_glob func_g;
 	struct ftrace_hash **orig_hash;
 	struct ftrace_hash *old_hash;
@@ -4732,11 +4734,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
 
 	mutex_lock(&ftrace_lock);
 	/* Check if the probe_ops is already registered */
-	list_for_each_entry(probe, &tr->func_probes, list) {
-		if (probe->probe_ops == probe_ops)
+	list_for_each_entry(iter, &tr->func_probes, list) {
+		if (iter->probe_ops == probe_ops) {
+			probe = iter;
 			break;
+		}
 	}
-	if (&probe->list == &tr->func_probes)
+	if (!probe)
 		goto err_unlock_ftrace;
 
 	ret = -EINVAL;
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index b045fa9f276c..7d4478525c66 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -648,7 +648,7 @@ static struct trace_event_functions eprobe_funcs = {
 static int disable_eprobe(struct trace_eprobe *ep,
 			  struct trace_array *tr)
 {
-	struct event_trigger_data *trigger;
+	struct event_trigger_data *trigger = NULL, *iter;
 	struct trace_event_file *file;
 	struct eprobe_data *edata;
 
@@ -656,14 +656,16 @@ static int disable_eprobe(struct trace_eprobe *ep,
 	if (!file)
 		return -ENOENT;
 
-	list_for_each_entry(trigger, &file->triggers, list) {
-		if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE))
+	list_for_each_entry(iter, &file->triggers, list) {
+		if (!(iter->flags & EVENT_TRIGGER_FL_PROBE))
 			continue;
-		edata = trigger->private_data;
-		if (edata->ep == ep)
+		edata = iter->private_data;
+		if (edata->ep == ep) {
+			trigger = iter;
 			break;
+		}
 	}
-	if (list_entry_is_head(trigger, &file->triggers, list))
+	if (!trigger)
 		return -ENODEV;
 
 	list_del_rcu(&trigger->list);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e4a442060707..78f313b7b315 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2279,8 +2279,8 @@ static struct dentry *
 event_subsystem_dir(struct trace_array *tr, const char *name,
 		    struct trace_event_file *file, struct dentry *parent)
 {
+	struct event_subsystem *system, *iter;
 	struct trace_subsystem_dir *dir;
-	struct event_subsystem *system;
 	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
@@ -2294,13 +2294,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	}
 
 	/* Now see if the system itself exists. */
-	list_for_each_entry(system, &event_subsystems, list) {
-		if (strcmp(system->name, name) == 0)
+	system = NULL;
+	list_for_each_entry(iter, &event_subsystems, list) {
+		if (strcmp(iter->name, name) == 0) {
+			system = iter;
 			break;
+		}
 	}
-	/* Reset system variable when not found */
-	if (&system->list == &event_subsystems)
-		system = NULL;
 
 	dir = kmalloc(sizeof(*dir), GFP_KERNEL);
 	if (!dir)
-- 
cgit v1.2.3


From 2c33d775ef4c25c0e1e1cc0fd5496d02f76bfa20 Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@linutronix.de>
Date: Thu, 28 Apr 2022 08:24:32 +0200
Subject: timekeeping: Mark NMI safe time accessors as notrace

Mark the CLOCK_MONOTONIC fast time accessors as notrace. These functions are
used in tracing to retrieve timestamps, so they should not recurse.

Fixes: 4498e7467e9e ("time: Parametrize all tk_fast_mono users")
Fixes: f09cb9a1808e ("time: Introduce tk_fast_raw")
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220426175338.3807ca4f@gandalf.local.home/
Link: https://lore.kernel.org/r/20220428062432.61063-1-kurt@linutronix.de
---
 kernel/time/timekeeping.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dcdcb85121e4..3b1398fbddaf 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -482,7 +482,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
  * of the following timestamps. Callers need to be aware of that and
  * deal with it.
  */
-u64 ktime_get_mono_fast_ns(void)
+u64 notrace ktime_get_mono_fast_ns(void)
 {
 	return __ktime_get_fast_ns(&tk_fast_mono);
 }
@@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
  * Contrary to ktime_get_mono_fast_ns() this is always correct because the
  * conversion factor is not affected by NTP/PTP correction.
  */
-u64 ktime_get_raw_fast_ns(void)
+u64 notrace ktime_get_raw_fast_ns(void)
 {
 	return __ktime_get_fast_ns(&tk_fast_raw);
 }
-- 
cgit v1.2.3


From 4e3c7d338a2260406ae22eaf6d77b639d59bdc7e Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Fri, 18 Mar 2022 17:36:56 +0100
Subject: sched/fair: Refactor cpu_util_without()

Except the 'task has no contribution or is new' condition at the
beginning of cpu_util_without(), which it shares with the load and
runnable counterpart functions, a cpu_util_next(..., dst_cpu = -1)
call can replace the rest of it.

The UTIL_EST specific check that task util_est has to be subtracted
from the CPU one in case of an enqueued (or current (to cater for the
wakeup - lb race)) task has to be moved to cpu_util_next().
This was initially introduced by commit c469933e7721
("sched/fair: Fix cpu_util_wake() for 'execl' type workloads").
UnixBench's `execl` throughput tests were run on the dual socket 40
CPUs Intel E5-2690 v2 to make sure it doesn't regress again.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20220318163656.954440-1-dietmar.eggemann@arm.com
---
 kernel/sched/fair.c | 157 +++++++++++++++++++---------------------------------
 1 file changed, 57 insertions(+), 100 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c420124b5d6..7d38728ebe97 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6544,108 +6544,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 }
 
 /*
- * cpu_util_without: compute cpu utilization without any contributions from *p
- * @cpu: the CPU which utilization is requested
- * @p: the task which utilization should be discounted
- *
- * The utilization of a CPU is defined by the utilization of tasks currently
- * enqueued on that CPU as well as tasks which are currently sleeping after an
- * execution on that CPU.
- *
- * This method returns the utilization of the specified CPU by discounting the
- * utilization of the specified task, whenever the task is currently
- * contributing to the CPU utilization.
- */
-static unsigned long cpu_util_without(int cpu, struct task_struct *p)
-{
-	struct cfs_rq *cfs_rq;
-	unsigned int util;
-
-	/* Task has no contribution or is new */
-	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
-		return cpu_util_cfs(cpu);
-
-	cfs_rq = &cpu_rq(cpu)->cfs;
-	util = READ_ONCE(cfs_rq->avg.util_avg);
-
-	/* Discount task's util from CPU's util */
-	lsub_positive(&util, task_util(p));
-
-	/*
-	 * Covered cases:
-	 *
-	 * a) if *p is the only task sleeping on this CPU, then:
-	 *      cpu_util (== task_util) > util_est (== 0)
-	 *    and thus we return:
-	 *      cpu_util_without = (cpu_util - task_util) = 0
-	 *
-	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
-	 *    IDLE, then:
-	 *      cpu_util >= task_util
-	 *      cpu_util > util_est (== 0)
-	 *    and thus we discount *p's blocked utilization to return:
-	 *      cpu_util_without = (cpu_util - task_util) >= 0
-	 *
-	 * c) if other tasks are RUNNABLE on that CPU and
-	 *      util_est > cpu_util
-	 *    then we use util_est since it returns a more restrictive
-	 *    estimation of the spare capacity on that CPU, by just
-	 *    considering the expected utilization of tasks already
-	 *    runnable on that CPU.
-	 *
-	 * Cases a) and b) are covered by the above code, while case c) is
-	 * covered by the following code when estimated utilization is
-	 * enabled.
-	 */
-	if (sched_feat(UTIL_EST)) {
-		unsigned int estimated =
-			READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
-		/*
-		 * Despite the following checks we still have a small window
-		 * for a possible race, when an execl's select_task_rq_fair()
-		 * races with LB's detach_task():
-		 *
-		 *   detach_task()
-		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
-		 *     ---------------------------------- A
-		 *     deactivate_task()                   \
-		 *       dequeue_task()                     + RaceTime
-		 *         util_est_dequeue()              /
-		 *     ---------------------------------- B
-		 *
-		 * The additional check on "current == p" it's required to
-		 * properly fix the execl regression and it helps in further
-		 * reducing the chances for the above race.
-		 */
-		if (unlikely(task_on_rq_queued(p) || current == p))
-			lsub_positive(&estimated, _task_util_est(p));
-
-		util = max(util, estimated);
-	}
-
-	/*
-	 * Utilization (estimated) can exceed the CPU capacity, thus let's
-	 * clamp to the maximum CPU capacity to ensure consistency with
-	 * cpu_util.
-	 */
-	return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ * (@dst_cpu = -1) or migrated to @dst_cpu.
  */
 static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 {
 	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
-	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
 
 	/*
-	 * If @p migrates from @cpu to another, remove its contribution. Or,
-	 * if @p migrates from another CPU to @cpu, add its contribution. In
-	 * the other cases, @cpu is not impacted by the migration, so the
-	 * util_avg should already be correct.
+	 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+	 * contribution. If @p migrates from another CPU to @cpu add its
+	 * contribution. In all the other cases @cpu is not impacted by the
+	 * migration so its util_avg is already correct.
 	 */
 	if (task_cpu(p) == cpu && dst_cpu != cpu)
 		lsub_positive(&util, task_util(p));
@@ -6653,16 +6564,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 		util += task_util(p);
 
 	if (sched_feat(UTIL_EST)) {
+		unsigned long util_est;
+
 		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
 
 		/*
-		 * During wake-up, the task isn't enqueued yet and doesn't
-		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
-		 * so just add it (if needed) to "simulate" what will be
-		 * cpu_util after the task has been enqueued.
+		 * During wake-up @p isn't enqueued yet and doesn't contribute
+		 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+		 * has been enqueued.
+		 *
+		 * During exec (@dst_cpu = -1) @p is enqueued and does
+		 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+		 * Remove it to "simulate" cpu_util without @p's contribution.
+		 *
+		 * Despite the task_on_rq_queued(@p) check there is still a
+		 * small window for a possible race when an exec
+		 * select_task_rq_fair() races with LB's detach_task().
+		 *
+		 *   detach_task()
+		 *     deactivate_task()
+		 *       p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *       -------------------------------- A
+		 *       dequeue_task()                    \
+		 *         dequeue_task_fair()              + Race Time
+		 *           util_est_dequeue()            /
+		 *       -------------------------------- B
+		 *
+		 * The additional check "current == p" is required to further
+		 * reduce the race window.
 		 */
 		if (dst_cpu == cpu)
 			util_est += _task_util_est(p);
+		else if (unlikely(task_on_rq_queued(p) || current == p))
+			lsub_positive(&util_est, _task_util_est(p));
 
 		util = max(util, util_est);
 	}
@@ -6670,6 +6605,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 	return min(util, capacity_orig_of(cpu));
 }
 
+/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+	/* Task has no contribution or is new */
+	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+		return cpu_util_cfs(cpu);
+
+	return cpu_util_next(cpu, p, -1);
+}
+
 /*
  * compute_energy(): Estimates the energy that @pd would consume if @p was
  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
-- 
cgit v1.2.3


From 50e7b416d2ab10b9771bd00a4d85df90ad2e4b37 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 28 Apr 2022 15:43:37 +0100
Subject: sched/fair: Remove sched_trace_*() helper functions

We no longer need them as we can use DWARF debug info or BTF + pahole to
re-generate the required structs to compile against them for a given
kernel.

This moves the burden of maintaining these helper functions to the
module.

	https://github.com/qais-yousef/sched_tp

Note that pahole v1.15 is required at least for using DWARF. And for BTF
v1.23 which is not yet released will be required. There's alignment
problem that will lead to crashes in earlier versions when used with
BTF.

We should have enough infrastructure to make these helper functions now
obsolete, so remove them.

[Rewrote commit message to reflect the new alternative]
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220428144338.479094-2-qais.yousef@arm.com
---
 kernel/sched/fair.c | 98 -----------------------------------------------------
 1 file changed, 98 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d38728ebe97..19803e1c9138 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11839,101 +11839,3 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
-	return cfs_rq ? &cfs_rq->avg : NULL;
-#else
-	return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
-	if (!cfs_rq) {
-		if (str)
-			strlcpy(str, "(null)", len);
-		else
-			return NULL;
-	}
-
-	cfs_rq_tg_path(cfs_rq, str, len);
-	return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
-	return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-	return rq ? &rq->avg_rt : NULL;
-#else
-	return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-	return rq ? &rq->avg_dl : NULL;
-#else
-	return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
-	return rq ? &rq->avg_irq : NULL;
-#else
-	return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
-	return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-int sched_trace_rq_cpu_capacity(struct rq *rq)
-{
-	return rq ?
-#ifdef CONFIG_SMP
-		rq->cpu_capacity
-#else
-		SCHED_CAPACITY_SCALE
-#endif
-		: -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
-	return rd ? rd->span : NULL;
-#else
-	return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
-int sched_trace_rq_nr_running(struct rq *rq)
-{
-        return rq ? rq->nr_running : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
-- 
cgit v1.2.3


From 97956dd278d3af1b5657026b992b54cf2e1b50b9 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 28 Apr 2022 15:43:38 +0100
Subject: sched/fair: Remove cfs_rq_tg_path()

cfs_rq_tg_path() is used by a tracepoint-to traceevent (tp-2-te)
converter to format the path of a taskgroup or autogroup respectively.
It doesn't have any in-kernel users after the removal of the
sched_trace_cfs_rq_path() helper function.

cfs_rq_tg_path() can be coded in a tp-2-te converter.

Remove it from kernel/sched/fair.c.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220428144338.479094-3-qais.yousef@arm.com
---
 kernel/sched/fair.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 19803e1c9138..6ca054b11a16 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -313,19 +313,6 @@ const struct sched_class fair_sched_class;
 #define for_each_sched_entity(se) \
 		for (; se; se = se->parent)
 
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
-	if (!path)
-		return;
-
-	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
-		autogroup_path(cfs_rq->tg, path, len);
-	else if (cfs_rq && cfs_rq->tg->css.cgroup)
-		cgroup_path(cfs_rq->tg->css.cgroup, path, len);
-	else
-		strlcpy(path, "(null)", len);
-}
-
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -493,12 +480,6 @@ static int se_is_idle(struct sched_entity *se)
 #define for_each_sched_entity(se) \
 		for (; se; se = NULL)
 
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
-	if (path)
-		strlcpy(path, "(null)", len);
-}
-
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	return true;
-- 
cgit v1.2.3


From e999995c84c3abb6dcae83f8c35942a8d4ee0451 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Thu, 21 Apr 2022 00:00:05 +0800
Subject: ftrace: cleanup ftrace_graph_caller enable and disable

The ftrace_[enable,disable]_ftrace_graph_caller() are used to do
special hooks for graph tracer, which are not needed on some ARCHs
that use graph_ops:func function to install return_hooker.

So introduce the weak version in ftrace core code to cleanup
in x86.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20220420160006.17880-1-zhouchengming@bytedance.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 kernel/trace/fgraph.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8f4fb328133a..289311680c29 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -30,6 +30,24 @@ int ftrace_graph_active;
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 
+/*
+ * archs can override this function if they must do something
+ * to enable hook for graph tracer.
+ */
+int __weak ftrace_enable_ftrace_graph_caller(void)
+{
+	return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * to disable hook for graph tracer.
+ */
+int __weak ftrace_disable_ftrace_graph_caller(void)
+{
+	return 0;
+}
+
 /**
  * ftrace_graph_stop - set to permanently disable function graph tracing
  *
-- 
cgit v1.2.3


From 4cbf6f621150e4fca78543067260f68fab0ee328 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Wed, 27 Apr 2022 18:54:46 -0700
Subject: seccomp: Use FIFO semantics to order notifications

Previously, the seccomp notifier used LIFO semantics, where each
notification would be added on top of the stack, and notifications
were popped off the top of the stack. This could result one process
that generates a large number of notifications preventing other
notifications from being handled. This patch moves from LIFO (stack)
semantics to FIFO (queue semantics).

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220428015447.13661-1-sargun@sargun.me
---
 kernel/seccomp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b5ac87f6dbd4..3caa0fe85235 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1100,7 +1100,7 @@ static int seccomp_do_user_notification(int this_syscall,
 	n.data = sd;
 	n.id = seccomp_next_notify_id(match);
 	init_completion(&n.ready);
-	list_add(&n.list, &match->notif->notifications);
+	list_add_tail(&n.list, &match->notif->notifications);
 	INIT_LIST_HEAD(&n.addfd);
 
 	up(&match->notif->request);
-- 
cgit v1.2.3


From c06d7aaf2951ce7f986a879127995728d63d8577 Mon Sep 17 00:00:00 2001
From: Haowen Bai <baihaowen@meizu.com>
Date: Fri, 29 Apr 2022 14:38:00 -0700
Subject: kernel: pid_namespace: use NULL instead of using plain integer as
 pointer

This fixes the following sparse warnings:
kernel/pid_namespace.c:55:77: warning: Using plain integer as NULL pointer

Link: https://lkml.kernel.org/r/1647944288-2806-1-git-send-email-baihaowen@meizu.com
Signed-off-by: Haowen Bai <baihaowen@meizu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/pid_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a46a3723bc66..f4f8cb0435b4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
 	/* Name collision forces to do allocation under mutex. */
 	if (!*pkc)
 		*pkc = kmem_cache_create(name, len, 0,
-					 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
+					 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 	mutex_unlock(&pid_caches_mutex);
 	/* current can fail, but someone else can succeed. */
 	return READ_ONCE(*pkc);
-- 
cgit v1.2.3


From 67fca000e1e173fe2c539a127ccf1bc338d5ff37 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 29 Apr 2022 14:38:00 -0700
Subject: lib/Kconfig.debug: remove more CONFIG_..._VALUE indirections

As in "kernel/panic.c: remove CONFIG_PANIC_ON_OOPS_VALUE indirection",
use the IS_ENABLED() helper rather than having a hidden config option.

Link: https://lkml.kernel.org/r/20220321121301.1389693-1-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/hung_task.c | 2 +-
 kernel/watchdog.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 52501e5f7655..cff3ae8c818f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
  * hung task is detected:
  */
 unsigned int __read_mostly sysctl_hung_task_panic =
-				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+				IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
 
 static int
 hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9166220457bc..ecb0e8346e65 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
  * Should we panic when a soft-lockup or hard-lockup occurs:
  */
 unsigned int __read_mostly hardlockup_panic =
-			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+			IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
 /*
  * We may not want to enable hard lockup detection by default in all cases,
  * for example when running the kernel as a guest on a hypervisor. In these
@@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
 
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
-			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+			IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
 
 static bool softlockup_initialized __read_mostly;
 static u64 __read_mostly sample_period;
-- 
cgit v1.2.3


From f26b2afd53e70db67be8252d340b4a1387ec8b55 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 29 Apr 2022 14:38:02 -0700
Subject: ptrace: remove redudant check of #ifdef PTRACE_SINGLESTEP

Patch series "ptrace: do some cleanup".


This patch (of 3):

PTRACE_SINGLESTEP is always defined as 9 in include/uapi/linux/ptrace.h,
remove redudant check of #ifdef PTRACE_SINGLESTEP.

Link: https://lkml.kernel.org/r/1649240981-11024-2-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/ptrace.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ccc4b465775b..49c29baf9907 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -829,11 +829,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
 }
 #endif
 
-#ifdef PTRACE_SINGLESTEP
 #define is_singlestep(request)		((request) == PTRACE_SINGLESTEP)
-#else
-#define is_singlestep(request)		0
-#endif
 
 #ifdef PTRACE_SINGLEBLOCK
 #define is_singleblock(request)		((request) == PTRACE_SINGLEBLOCK)
@@ -1221,9 +1217,7 @@ int ptrace_request(struct task_struct *child, long request,
 	}
 #endif
 
-#ifdef PTRACE_SINGLESTEP
 	case PTRACE_SINGLESTEP:
-#endif
 #ifdef PTRACE_SINGLEBLOCK
 	case PTRACE_SINGLEBLOCK:
 #endif
-- 
cgit v1.2.3


From 16b0b7adabfb5564a77fa35917afe08decd55b29 Mon Sep 17 00:00:00 2001
From: Michal Orzel <michalorzel.eng@gmail.com>
Date: Fri, 29 Apr 2022 14:38:03 -0700
Subject: kexec: remove redundant assignments

Get rid of redundant assignments which end up in values not being read
either because they are overwritten or the function ends.

Reported by clang-tidy [deadcode.DeadStores]

Link: https://lkml.kernel.org/r/20220326180948.192154-1-michalorzel.eng@gmail.com
Signed-off-by: Michal Orzel <michalorzel.eng@gmail.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Michal Orzel <michalorzel.eng@gmail.com>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 68480f731192..d08904a27362 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -768,7 +768,6 @@ static struct page *kimage_alloc_page(struct kimage *image,
 				kimage_free_pages(old_page);
 				continue;
 			}
-			addr = old_addr;
 			page = old_page;
 			break;
 		}
@@ -788,7 +787,6 @@ static int kimage_load_normal_segment(struct kimage *image,
 	unsigned char __user *buf = NULL;
 	unsigned char *kbuf = NULL;
 
-	result = 0;
 	if (image->file_mode)
 		kbuf = segment->kbuf;
 	else
-- 
cgit v1.2.3


From 0e0af57e0e91b304f36b7d1dba859e3c04094273 Mon Sep 17 00:00:00 2001
From: "Dr. Thomas Orgis" <thomas.orgis@uni-hamburg.de>
Date: Fri, 29 Apr 2022 14:38:03 -0700
Subject: taskstats: version 12 with thread group and exe info

The task exit struct needs some crucial information to be able to provide
an enhanced version of process and thread accounting.  This change
provides:

1. ac_tgid in additon to ac_pid
2. thread group execution walltime in ac_tgetime
3. flag AGROUP in ac_flag to indicate the last task
   in a thread group / process
4. device ID and inode of task's /proc/self/exe in
   ac_exe_dev and ac_exe_inode
5. tools/accounting/procacct as demonstrator

When a task exits, taskstats are reported to userspace including the
task's pid and ppid, but without the id of the thread group this task is
part of.  Without the tgid, the stats of single tasks cannot be correlated
to each other as a thread group (process).

The taskstats documentation suggests that on process exit a data set
consisting of accumulated stats for the whole group is produced.  But such
an additional set of stats is only produced for actually multithreaded
processes, not groups that had only one thread, and also those stats only
contain data about delay accounting and not the more basic information
about CPU and memory resource usage.  Adding the AGROUP flag to be set
when the last task of a group exited enables determination of process end
also for single-threaded processes.

My applicaton basically does enhanced process accounting with summed
cputime, biggest maxrss, tasks per process.  The data is not available
with the traditional BSD process accounting (which is not designed to be
extensible) and the taskstats interface allows more efficient on-the-fly
grouping and summing of the stats, anyway, without intermediate disk
writes.

Furthermore, I do carry statistics on which exact program binary is used
how often with associated resources, getting a picture on how important
which parts of a collection of installed scientific software in different
versions are, and how well they put load on the machine.  This is enabled
by providing information on /proc/self/exe for each task.  I assume the
two 64-bit fields for device ID and inode are more appropriate than the
possibly large resolved path to keep the data volume down.

Add the tgid to the stats to complete task identification, the flag AGROUP
to mark the last task of a group, the group wallclock time, and
inode-based identification of the associated executable file.

Add tools/accounting/procacct.c as a simplified fork of getdelays.c to
demonstrate process and thread accounting.

[thomas.orgis@uni-hamburg.de: fix version number in comment]
  Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster
Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster
Signed-off-by: Dr. Thomas Orgis <thomas.orgis@uni-hamburg.de>
Reviewed-by: Ismael Luceno <ismael@iodev.co.uk>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/taskstats.c | 23 +++++++++++++++++++++++
 kernel/tsacct.c    | 10 ++++++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bcac5a9043aa..72415e22342b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/tsacct_kern.h>
+#include <linux/acct.h>
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
@@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb,
 	up_write(&listeners->sem);
 }
 
+static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+	/* No idea if I'm allowed to access that here, now. */
+	struct file *exe_file = get_task_exe_file(tsk);
+
+	if (exe_file) {
+		/* Following cp_new_stat64() in stat.c . */
+		stats->ac_exe_dev =
+			huge_encode_dev(exe_file->f_inode->i_sb->s_dev);
+		stats->ac_exe_inode = exe_file->f_inode->i_ino;
+		fput(exe_file);
+	} else {
+		stats->ac_exe_dev = 0;
+		stats->ac_exe_inode = 0;
+	}
+}
+
 static void fill_stats(struct user_namespace *user_ns,
 		       struct pid_namespace *pid_ns,
 		       struct task_struct *tsk, struct taskstats *stats)
@@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns,
 
 	/* fill in extended acct fields */
 	xacct_add_tsk(stats, tsk);
+
+	/* add executable info */
+	exe_add_tsk(stats, tsk);
 }
 
 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
@@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 		goto err;
 
 	fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
+	if (group_dead)
+		stats->ac_flag |= AGROUP;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 1d261fbe367b..4252f0645b9e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 {
 	const struct cred *tcred;
 	u64 utime, stime, utimescaled, stimescaled;
-	u64 delta;
+	u64 now_ns, delta;
 	time64_t btime;
 
 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
 
 	/* calculate task elapsed time in nsec */
-	delta = ktime_get_ns() - tsk->start_time;
+	now_ns = ktime_get_ns();
+	/* store whole group time first */
+	delta = now_ns - tsk->group_leader->start_time;
 	/* Convert to micro seconds */
 	do_div(delta, NSEC_PER_USEC);
+	stats->ac_tgetime = delta;
+	delta = now_ns - tsk->start_time;
+	do_div(delta, NSEC_PER_USEC);
 	stats->ac_etime = delta;
 	/* Convert to seconds for btime (note y2106 limit) */
 	btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
@@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 	stats->ac_nice	 = task_nice(tsk);
 	stats->ac_sched	 = tsk->policy;
 	stats->ac_pid	 = task_pid_nr_ns(tsk, pid_ns);
+	stats->ac_tgid   = task_tgid_nr_ns(tsk, pid_ns);
 	rcu_read_lock();
 	tcred = __task_cred(tsk);
 	stats->ac_uid	 = from_kuid_munged(user_ns, tcred->uid);
-- 
cgit v1.2.3


From edc73c7261ca3ea79867437bb0b9dab0e232436c Mon Sep 17 00:00:00 2001
From: xu xin <xu.xin16@zte.com.cn>
Date: Fri, 29 Apr 2022 14:38:03 -0700
Subject: kernel: make taskstats available from all net namespaces

If getdelays runs in a non-init network namespace, it will fail in getting
delayacct stats even if it has privilege of root user, which seems to be
not very reasonable.  We can simply reproduce this by executing commands:

	unshare -n
	getdelays -d -p <pid>

I don't think net namespace should be an obstacle to the normal execution
of getdelay function.  So let's make it available from all net namespaces.

Link: https://lkml.kernel.org/r/20220412071946.2532318-1-xu.xin16@zte.com.cn
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: "Dr. Thomas Orgis" <thomas.orgis@uni-hamburg.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Ismael Luceno <ismael@iodev.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/taskstats.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 72415e22342b..f7e246336218 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -688,6 +688,7 @@ static struct genl_family family __ro_after_init = {
 	.module		= THIS_MODULE,
 	.ops		= taskstats_ops,
 	.n_ops		= ARRAY_SIZE(taskstats_ops),
+	.netnsok	= true,
 };
 
 /* Needed early in initialization */
-- 
cgit v1.2.3


From e788be95a57a9bebe446878ce9bf2750f6fe4974 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 28 Apr 2022 17:25:16 -0600
Subject: task_work: allow TWA_SIGNAL without a rescheduling IPI

Some use cases don't always need an IPI when sending a TWA_SIGNAL
notification. Add TWA_SIGNAL_NO_IPI, which is just like TWA_SIGNAL, except
it doesn't send an IPI to the target task. It merely sets
TIF_NOTIFY_SIGNAL and wakes up the task.

This can be useful in avoiding a forceful transition to the kernel if the
task is running in userspace. Depending on the task_work in question, it
may be quite fine waiting for the next reschedule or kernel enter anyway,
or the use case may even have other mechanisms for hinting to the task
that a transition may be useful. This can drive more cooperative
scheduling of task_work.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/821f42b6-7d91-8074-8212-d34998097de4@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/task_work.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/task_work.c b/kernel/task_work.c
index c59e1a49bc40..dff75bcde151 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
  * @notify: how to notify the targeted task
  *
  * Queue @work for task_work_run() below and notify the @task if @notify
- * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the
- * it will interrupt the targeted task and run the task_work. @TWA_RESUME
- * work is run only when the task exits the kernel and returns to user mode,
- * or before entering guest mode. Fails if the @task is exiting/exited and thus
- * it can't process this @work. Otherwise @work->func() will be called when the
- * @task goes through one of the aforementioned transitions, or exits.
+ * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
+ *
+ * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted
+ * task and run the task_work, regardless of whether the task is currently
+ * running in the kernel or userspace.
+ * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
+ * reschedule IPI to force the targeted task to reschedule and run task_work.
+ * This can be advantageous if there's no strict requirement that the
+ * task_work be run as soon as possible, just whenever the task enters the
+ * kernel anyway.
+ * @TWA_RESUME work is run only when the task exits the kernel and returns to
+ * user mode, or before entering guest mode.
+ *
+ * Fails if the @task is exiting/exited and thus it can't process this @work.
+ * Otherwise @work->func() will be called when the @task goes through one of
+ * the aforementioned transitions, or exits.
  *
  * If the targeted task is exiting, then an error is returned and the work item
  * is not queued. It's up to the caller to arrange for an alternative mechanism
@@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 	case TWA_SIGNAL:
 		set_notify_signal(task);
 		break;
+	case TWA_SIGNAL_NO_IPI:
+		__set_notify_signal(task);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;
-- 
cgit v1.2.3


From d664e399128bd78b905ff480917e2c2d4949e101 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Apr 2022 15:31:02 +0200
Subject: sched: Fix missing prototype warnings

A W=1 build emits more than a dozen missing prototype warnings related to
scheduler and scheduler specific includes.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220413133024.249118058@linutronix.de
---
 kernel/sched/build_policy.c  | 2 ++
 kernel/sched/build_utility.c | 1 +
 kernel/sched/core.c          | 3 +++
 kernel/sched/deadline.c      | 2 --
 kernel/sched/fair.c          | 1 +
 kernel/sched/sched.h         | 8 ++------
 kernel/sched/smp.h           | 6 ++++++
 kernel/stop_machine.c        | 2 --
 8 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index e0104b45029a..d9dc9ab3773f 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -15,6 +15,7 @@
 /* Headers: */
 #include <linux/sched/clock.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
 #include <linux/sched/posix-timers.h>
 #include <linux/sched/rt.h>
 
@@ -31,6 +32,7 @@
 #include <uapi/linux/sched/types.h>
 
 #include "sched.h"
+#include "smp.h"
 
 #include "autogroup.h"
 #include "stats.h"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index eec0849b2aae..99bdd96f454f 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -14,6 +14,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/rseq_api.h>
 #include <linux/sched/task_stack.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 068c088e9584..e644578bc871 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,7 +26,10 @@
 #include <linux/topology.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/mm.h>
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb4255ae0b2c..6ae423627a7a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 	return (dl_se->runtime <= 0);
 }
 
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
 /*
  * This function implements the GRUB accounting rule:
  * according to the GRUB reclaiming algorithm, the runtime is
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6ca054b11a16..bc9f6e94c84e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -36,6 +36,7 @@
 #include <linux/sched/cond_resched.h>
 #include <linux/sched/cputime.h>
 #include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
 
 #include <linux/cpuidle.h>
 #include <linux/interrupt.h>
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 762be73972bd..4784898e8f83 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1833,12 +1833,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
 #endif
 
 extern int sched_update_scaling(void);
-
-extern void flush_smp_call_function_from_idle(void);
-
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+#endif /* CONFIG_SMP */
 
 #include "stats.h"
 
@@ -2315,6 +2310,7 @@ extern void resched_cpu(int cpu);
 
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..5719bf9280e9 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -7,3 +7,9 @@
 extern void sched_ttwu_pending(void *arg);
 
 extern void send_call_function_single_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_from_idle(void);
+#else
+static inline void flush_smp_call_function_from_idle(void) { }
+#endif
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index cbc30271ea4d..6da7b91af353 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -535,8 +535,6 @@ void stop_machine_park(int cpu)
 	kthread_park(stopper->thread);
 }
 
-extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-
 static void cpu_stop_create(unsigned int cpu)
 {
 	sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
-- 
cgit v1.2.3


From 16bf5a5e1ec56474ed2a19d72f272ed09a5d3ea1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Apr 2022 15:31:03 +0200
Subject: smp: Rename flush_smp_call_function_from_idle()

This is invoked from the stopper thread too, which is definitely not idle.
Rename it to flush_smp_call_function_queue() and fixup the callers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220413133024.305001096@linutronix.de
---
 kernel/sched/core.c |  2 +-
 kernel/sched/idle.c |  2 +-
 kernel/sched/smp.h  |  4 ++--
 kernel/smp.c        | 27 ++++++++++++++++++++-------
 4 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e644578bc871..07bacb050198 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2411,7 +2411,7 @@ static int migration_cpu_stop(void *data)
 	 * __migrate_task() such that we will not miss enforcing cpus_ptr
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
-	flush_smp_call_function_from_idle();
+	flush_smp_call_function_queue();
 
 	raw_spin_lock(&p->pi_lock);
 	rq_lock(rq, &rf);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f8b5020e76a..60295dbe0d62 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -327,7 +327,7 @@ static void do_idle(void)
 	 * RCU relies on this call to be done outside of an RCU read-side
 	 * critical section.
 	 */
-	flush_smp_call_function_from_idle();
+	flush_smp_call_function_queue();
 	schedule_idle();
 
 	if (unlikely(klp_patch_pending(current)))
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 5719bf9280e9..2eb23dd0f285 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -9,7 +9,7 @@ extern void sched_ttwu_pending(void *arg);
 extern void send_call_function_single_ipi(int cpu);
 
 #ifdef CONFIG_SMP
-extern void flush_smp_call_function_from_idle(void);
+extern void flush_smp_call_function_queue(void);
 #else
-static inline void flush_smp_call_function_from_idle(void) { }
+static inline void flush_smp_call_function_queue(void) { }
 #endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 01a7c1706a58..8e85f22ed538 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
-static void flush_smp_call_function_queue(bool warn_cpu_offline);
+static void __flush_smp_call_function_queue(bool warn_cpu_offline);
 
 int smpcfd_prepare_cpu(unsigned int cpu)
 {
@@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
 	 * ensure that the outgoing CPU doesn't go offline with work
 	 * still pending.
 	 */
-	flush_smp_call_function_queue(false);
+	__flush_smp_call_function_queue(false);
 	irq_work_run();
 	return 0;
 }
@@ -541,11 +541,11 @@ void generic_smp_call_function_single_interrupt(void)
 {
 	cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
 		      smp_processor_id(), CFD_SEQ_GOTIPI);
-	flush_smp_call_function_queue(true);
+	__flush_smp_call_function_queue(true);
 }
 
 /**
- * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
  *
  * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
  *		      offline CPU. Skip this check if set to 'false'.
@@ -558,7 +558,7 @@ void generic_smp_call_function_single_interrupt(void)
  * Loop through the call_single_queue and run all the queued callbacks.
  * Must be called with interrupts disabled.
  */
-static void flush_smp_call_function_queue(bool warn_cpu_offline)
+static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 {
 	call_single_data_t *csd, *csd_next;
 	struct llist_node *entry, *prev;
@@ -681,7 +681,20 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 		      smp_processor_id(), CFD_SEQ_HDLEND);
 }
 
-void flush_smp_call_function_from_idle(void)
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *				   from task context (idle, migration thread)
+ *
+ * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
+ * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
+ * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
+ * handle queued SMP function calls before scheduling.
+ *
+ * The migration thread has to ensure that an eventually pending wakeup has
+ * been handled before it migrates a task.
+ */
+void flush_smp_call_function_queue(void)
 {
 	unsigned long flags;
 
@@ -691,7 +704,7 @@ void flush_smp_call_function_from_idle(void)
 	cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
 		      smp_processor_id(), CFD_SEQ_IDLE);
 	local_irq_save(flags);
-	flush_smp_call_function_queue(true);
+	__flush_smp_call_function_queue(true);
 	if (local_softirq_pending())
 		do_softirq();
 
-- 
cgit v1.2.3


From 1a90bfd220201fbe050dfc15deaac20ca5f15638 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 13 Apr 2022 15:31:05 +0200
Subject: smp: Make softirq handling RT safe in flush_smp_call_function_queue()

flush_smp_call_function_queue() invokes do_softirq() which is not available
on PREEMPT_RT. flush_smp_call_function_queue() is invoked from the idle
task and the migration task with preemption or interrupts disabled.

So RT kernels cannot process soft interrupts in that context as that has to
acquire 'sleeping spinlocks' which is not possible with preemption or
interrupts disabled and forbidden from the idle task anyway.

The currently known SMP function call which raises a soft interrupt is in
the block layer, but this functionality is not enabled on RT kernels due to
latency and performance reasons.

RT could wake up ksoftirqd unconditionally, but this wants to be avoided if
there were soft interrupts pending already when this is invoked in the
context of the migration task. The migration task might have preempted a
threaded interrupt handler which raised a soft interrupt, but did not reach
the local_bh_enable() to process it. The "running" ksoftirqd might prevent
the handling in the interrupt thread context which is causing latency
issues.

Add a new function which handles this case explicitely for RT and falls
back to do_softirq() on !RT kernels. In the RT case this warns when one of
the flushed SMP function calls raised a soft interrupt so this can be
investigated.

[ tglx: Moved the RT part out of SMP code ]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/YgKgL6aPj8aBES6G@linutronix.de
Link: https://lore.kernel.org/r/20220413133024.356509586@linutronix.de
---
 kernel/smp.c     |  5 ++++-
 kernel/softirq.c | 13 +++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 8e85f22ed538..d54c2fe51ada 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -696,6 +696,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
  */
 void flush_smp_call_function_queue(void)
 {
+	unsigned int was_pending;
 	unsigned long flags;
 
 	if (llist_empty(this_cpu_ptr(&call_single_queue)))
@@ -704,9 +705,11 @@ void flush_smp_call_function_queue(void)
 	cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
 		      smp_processor_id(), CFD_SEQ_IDLE);
 	local_irq_save(flags);
+	/* Get the already pending soft interrupts for RT enabled kernels */
+	was_pending = local_softirq_pending();
 	__flush_smp_call_function_queue(true);
 	if (local_softirq_pending())
-		do_softirq();
+		do_softirq_post_smp_call_flush(was_pending);
 
 	local_irq_restore(flags);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fac801815554..9f0aef8aa9ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -294,6 +294,19 @@ static inline void invoke_softirq(void)
 		wakeup_softirqd();
 }
 
+/*
+ * flush_smp_call_function_queue() can raise a soft interrupt in a function
+ * call. On RT kernels this is undesired and the only known functionality
+ * in the block layer which does this is disabled on RT. If soft interrupts
+ * get raised which haven't been raised before the flush, warn so it can be
+ * investigated.
+ */
+void do_softirq_post_smp_call_flush(unsigned int was_pending)
+{
+	if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
+		invoke_softirq();
+}
+
 #else /* CONFIG_PREEMPT_RT */
 
 /*
-- 
cgit v1.2.3


From 2667ed10d9f01e250ba806276740782c89d77fda Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 28 Apr 2022 11:00:41 -0700
Subject: mm: Fix PASID use-after-free issue

The PASID is being freed too early.  It needs to stay around until after
device drivers that might be using it have had a chance to clear it out
of the hardware.

The relevant refcounts are:

  mmget() /mmput()  refcount the mm's address space
  mmgrab()/mmdrop() refcount the mm itself

The PASID is currently tied to the life of the mm's address space and freed
in __mmput().  This makes logical sense because the PASID can't be used
once the address space is gone.

But, this misses an important point: even after the address space is gone,
the PASID will still be programmed into a device.  Device drivers might,
for instance, still need to flush operations that are outstanding and need
to use that PASID.  They do this at file->release() time.

Device drivers call the IOMMU driver to hold a reference on the mm itself
and drop it at file->release() time.  But, the IOMMU driver holds a
reference on the mm itself, not the address space.  The address space (and
the PASID) is long gone by the time the driver tries to clean up.  This is
effectively a use-after-free bug on the PASID.

To fix this, move the PASID free operation from __mmput() to __mmdrop().
This ensures that the IOMMU driver's existing mmgrab() keeps the PASID
allocated until it drops its mm reference.

Fixes: 701fac40384f ("iommu/sva: Assign a PASID to mm on PASID allocation and free it on mm exit")
Reported-by: Zhangfei Gao <zhangfei.gao@foxmail.com>
Suggested-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Suggested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Zhangfei Gao <zhangfei.gao@foxmail.com>
Reviewed-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/20220428180041.806809-1-fenghua.yu@intel.com
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..35a3beff140b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -792,6 +792,7 @@ void __mmdrop(struct mm_struct *mm)
 	mmu_notifier_subscriptions_destroy(mm);
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
+	mm_pasid_drop(mm);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1190,7 +1191,6 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
-	mm_pasid_drop(mm);
 	mmdrop(mm);
 }
 
-- 
cgit v1.2.3


From eff4849f928f2b90402907e06a6de1619cf16b1a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Apr 2022 11:19:35 +0200
Subject: timekeeping: Annotate ktime_get_boot_fast_ns() with data_race()

Accessing timekeeper::offset_boot in ktime_get_boot_fast_ns() is an
intended data race as the reader side cannot synchronize with a writer and
there is no space in struct tk_read_base of the NMI safe timekeeper.

Mark it so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220415091920.956045162@linutronix.de
---
 kernel/time/timekeeping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2c22023fbf5f..3479804ed5e6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -528,7 +528,7 @@ u64 notrace ktime_get_boot_fast_ns(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 
-	return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
 }
 EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
 
-- 
cgit v1.2.3


From 90be8d6c1f91e1e5121c219726524c91b52bfc20 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Apr 2022 11:19:38 +0200
Subject: timekeeping: Consolidate fast timekeeper

Provide a inline function which replaces the copy & pasta.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220415091921.072296632@linutronix.de
---
 kernel/time/timekeeping.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3479804ed5e6..8895ff20f82d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -429,6 +429,14 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
 	memcpy(base + 1, base, sizeof(*base));
 }
 
+static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr)
+{
+	u64 delta, cycles = tk_clock_read(tkr);
+
+	delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+	return timekeeping_delta_to_ns(tkr, delta);
+}
+
 static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 {
 	struct tk_read_base *tkr;
@@ -439,12 +447,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base);
-
-		now += timekeeping_delta_to_ns(tkr,
-				clocksource_delta(
-					tk_clock_read(tkr),
-					tkr->cycle_last,
-					tkr->mask));
+		now += fast_tk_get_delta_ns(tkr);
 	} while (read_seqcount_latch_retry(&tkf->seq, seq));
 
 	return now;
@@ -560,10 +563,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
 		tkr = tkf->base + (seq & 0x01);
 		basem = ktime_to_ns(tkr->base);
 		baser = ktime_to_ns(tkr->base_real);
-
-		delta = timekeeping_delta_to_ns(tkr,
-				clocksource_delta(tk_clock_read(tkr),
-				tkr->cycle_last, tkr->mask));
+		delta = fast_tk_get_delta_ns(tkr);
 	} while (read_seqcount_latch_retry(&tkf->seq, seq));
 
 	if (mono)
-- 
cgit v1.2.3


From ce4818957fdc5bca57fc2c92b0dfe109d26bcc47 Mon Sep 17 00:00:00 2001
From: Minghao Chi <chi.minghao@zte.com.cn>
Date: Mon, 18 Apr 2022 11:07:16 +0000
Subject: genirq: Use pm_runtime_resume_and_get() instead of
 pm_runtime_get_sync()

pm_runtime_resume_and_get() achieves the same and simplifies the code.

[ tglx: Simplify it further by presetting retval ]

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220418110716.2559453-1-chi.minghao@zte.com.cn
---
 kernel/irq/chip.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 54af0deb239b..e6b8e564b37f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1573,17 +1573,12 @@ static struct device *irq_get_parent_device(struct irq_data *data)
 int irq_chip_pm_get(struct irq_data *data)
 {
 	struct device *dev = irq_get_parent_device(data);
-	int retval;
+	int retval = 0;
 
-	if (IS_ENABLED(CONFIG_PM) && dev) {
-		retval = pm_runtime_get_sync(dev);
-		if (retval < 0) {
-			pm_runtime_put_noidle(dev);
-			return retval;
-		}
-	}
+	if (IS_ENABLED(CONFIG_PM) && dev)
+		retval = pm_runtime_resume_and_get(dev);
 
-	return 0;
+	return retval;
 }
 
 /**
-- 
cgit v1.2.3


From 92067440f1311dfa4d77b57a9da6b3706f5da32e Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Sun, 24 Apr 2022 12:47:20 +0100
Subject: time/sched_clock: Round the frequency reported to nearest rather than
 down

The frequency reported for clock sources are rounded down, which gives
misleading figures, e.g.:

 I/O ASIC clock frequency 24999480Hz
 sched_clock: 32 bits at 24MHz, resolution 40ns, wraps every 85901132779ns
 MIPS counter frequency 59998512Hz
 sched_clock: 32 bits at 59MHz, resolution 16ns, wraps every 35792281591ns

Rounding to nearest is more adequate:

 I/O ASIC clock frequency 24999664Hz
 sched_clock: 32 bits at 25MHz, resolution 40ns, wraps every 85900499947ns
 MIPS counter frequency 59999728Hz
 sched_clock: 32 bits at 60MHz, resolution 16ns, wraps every 35791556599ns

Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204240055590.9383@angie.orcam.me.uk
---
 kernel/time/sched_clock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index b1b9b12899f5..ee07f3ac1e5b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,6 +8,7 @@
 #include <linux/jiffies.h>
 #include <linux/ktime.h>
 #include <linux/kernel.h>
+#include <linux/math.h>
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
@@ -199,11 +200,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 
 	r = rate;
 	if (r >= 4000000) {
-		r /= 1000000;
+		r = DIV_ROUND_CLOSEST(r, 1000000);
 		r_unit = 'M';
 	} else {
 		if (r >= 1000) {
-			r /= 1000;
+			r = DIV_ROUND_CLOSEST(r, 1000);
 			r_unit = 'k';
 		} else {
 			r_unit = ' ';
-- 
cgit v1.2.3


From cc1b923a4e378c943386e7fe6205918d43e5fede Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Sun, 24 Apr 2022 12:47:26 +0100
Subject: time/sched_clock: Use Hz as the unit for clock rate reporting below
 4kHz

The kernel uses kHz as the unit for clock rates reported between 1MHz
(inclusive) and 4MHz (exclusive), e.g.:

 sched_clock: 64 bits at 1000kHz, resolution 1000ns, wraps every 2199023255500ns

This reduces the amount of data lost due to rounding, but hasn't been
replicated for the kHz range when support was added for proper reporting of
sub-kHz clock rates.  Take the same approach for rates between 1kHz
(inclusive) and 4kHz (exclusive), which makes it consistent.

Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204240106380.9383@angie.orcam.me.uk
---
 kernel/time/sched_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index ee07f3ac1e5b..4a95c0be9daf 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -203,7 +203,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 		r = DIV_ROUND_CLOSEST(r, 1000000);
 		r_unit = 'M';
 	} else {
-		if (r >= 1000) {
+		if (r >= 4000) {
 			r = DIV_ROUND_CLOSEST(r, 1000);
 			r_unit = 'k';
 		} else {
-- 
cgit v1.2.3


From f4b62e1e1137507268c2c63dc4e6da279dc58e9f Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Sun, 24 Apr 2022 12:47:30 +0100
Subject: time/sched_clock: Fix formatting of frequency reporting code

Use flat rather than nested indentation for chained else/if clauses as
per coding-style.rst:

	if (x == y) {
		..
	} else if (x > y) {
		...
	} else {
		....
	}

This also improves readability.

Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204240148220.9383@angie.orcam.me.uk
---
 kernel/time/sched_clock.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4a95c0be9daf..8464c5acc913 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -202,13 +202,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 	if (r >= 4000000) {
 		r = DIV_ROUND_CLOSEST(r, 1000000);
 		r_unit = 'M';
+	} else if (r >= 4000) {
+		r = DIV_ROUND_CLOSEST(r, 1000);
+		r_unit = 'k';
 	} else {
-		if (r >= 4000) {
-			r = DIV_ROUND_CLOSEST(r, 1000);
-			r_unit = 'k';
-		} else {
-			r_unit = ' ';
-		}
+		r_unit = ' ';
 	}
 
 	/* Calculate the ns resolution of this counter */
-- 
cgit v1.2.3


From f4a6a61cb6d40d9ae63e47743d33200f3efe3fe7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 20 Apr 2022 06:27:16 +0200
Subject: blktrace: cleanup the __trace_note_message interface

Pass the cgroup_subsys_state instead of a the blkg so that blktrace
doesn't need to poke into blk-cgroup internals, and give the name a
blk prefix as the current name is way too generic for a public
interface.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220420042723.1010598-9-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 4d5629196d01..9ef349ac49c0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt)
 	local_irq_restore(flags);
 }
 
-void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
-	const char *fmt, ...)
+void __blk_trace_note_message(struct blk_trace *bt,
+		struct cgroup_subsys_state *css, const char *fmt, ...)
 {
 	int n;
 	va_list args;
 	unsigned long flags;
 	char *buf;
+	u64 cgid = 0;
 
 	if (unlikely(bt->trace_state != Blktrace_running &&
 		     !blk_tracer_enabled))
@@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 	va_end(args);
 
-	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
-		blkcg = NULL;
 #ifdef CONFIG_BLK_CGROUP
-	trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
-		   blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
-#else
-	trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
+	if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+		cgid = cgroup_id(css->cgroup);
+	else
+		cgid = 1;
 #endif
+	trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(__trace_note_message);
+EXPORT_SYMBOL_GPL(__blk_trace_note_message);
 
 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 			 pid_t pid)
@@ -411,7 +411,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 		return PTR_ERR(msg);
 
 	bt = filp->private_data;
-	__trace_note_message(bt, NULL, "%s", msg);
+	__blk_trace_note_message(bt, NULL, "%s", msg);
 	kfree(msg);
 
 	return count;
-- 
cgit v1.2.3


From bbb1ebe7a909db4de49777fb7676d5bf293f34c9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 20 Apr 2022 06:27:17 +0200
Subject: blk-cgroup: replace bio_blkcg with bio_blkcg_css

All callers of bio_blkcg actually want the CSS, so replace it with an
interface that does return the CSS.  This now allows to move
struct blkcg_gq to block/blk-cgroup.h instead of exposing it in a
public header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220420042723.1010598-10-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9ef349ac49c0..10a32b0f2deb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -783,6 +783,7 @@ void blk_trace_shutdown(struct request_queue *q)
 #ifdef CONFIG_BLK_CGROUP
 static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 {
+	struct cgroup_subsys_state *blkcg_css;
 	struct blk_trace *bt;
 
 	/* We don't use the 'bt' value here except as an optimization... */
@@ -790,9 +791,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
 		return 0;
 
-	if (!bio->bi_blkg)
+	blkcg_css = bio_blkcg_css(bio);
+	if (!blkcg_css)
 		return 0;
-	return cgroup_id(bio_blkcg(bio)->css.cgroup);
+	return cgroup_id(blkcg_css->cgroup);
 }
 #else
 static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
-- 
cgit v1.2.3


From f624506f98b198e65b44da303f44974590fb16c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 20 Apr 2022 06:27:23 +0200
Subject: kthread: unexport kthread_blkcg

kthread_blkcg is only used by the built-in blk-cgroup code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20220420042723.1010598-16-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/kthread.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 50265f69a135..544fd4097406 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1522,5 +1522,4 @@ struct cgroup_subsys_state *kthread_blkcg(void)
 	}
 	return NULL;
 }
-EXPORT_SYMBOL(kthread_blkcg);
 #endif
-- 
cgit v1.2.3


From a57ffb3c6b67e59e8632f731414b792eacc6cca0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 31 Jan 2022 11:21:30 -0800
Subject: srcu: Automatically determine size-transition strategy at boot

This commit adds a srcutree.convert_to_big option of zero that causes
SRCU to decide at boot whether to wait for contention (small systems) or
immediately expand to large (large systems).  A new srcutree.big_cpu_lim
(defaulting to 128) defines how many CPUs constitute a large system.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0bc6a0a3edee..b9dec26245e0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -41,10 +41,10 @@ module_param(counter_wrap_check, ulong, 0444);
 
 /*
  * Control conversion to SRCU_SIZE_BIG:
- *    0: Don't convert at all (default).
+ *    0: Don't convert at all.
  *    1: Convert at init_srcu_struct() time.
  *    2: Convert when rcutorture invokes srcu_torture_stats_print().
- *    3: Decide at boot time based on system shape.
+ *    3: Decide at boot time based on system shape (default).
  * 0x1x: Convert when excessive contention encountered.
  */
 #define SRCU_SIZING_NONE	0
@@ -57,9 +57,13 @@ module_param(counter_wrap_check, ulong, 0444);
 #define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
 #define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
 #define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
-static int convert_to_big = SRCU_SIZING_NONE;
+static int convert_to_big = SRCU_SIZING_AUTO;
 module_param(convert_to_big, int, 0444);
 
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
 /* Contention events per jiffy to initiate transition to big. */
 static int small_contention_lim __read_mostly = 100;
 module_param(small_contention_lim, int, 0444);
@@ -1619,6 +1623,17 @@ void __init srcu_init(void)
 {
 	struct srcu_struct *ssp;
 
+	/* Decide on srcu_struct-size strategy. */
+	if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+		if (nr_cpu_ids >= big_cpu_lim) {
+			convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+			pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+		} else {
+			convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+			pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+		}
+	}
+
 	/*
 	 * Once that is set, call_srcu() can follow the normal path and
 	 * queue delayed work. This must follow RCU workqueues creation
@@ -1629,6 +1644,8 @@ void __init srcu_init(void)
 		ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
 				      work.work.entry);
 		list_del_init(&ssp->work.work.entry);
+		if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
+			ssp->srcu_size_state = SRCU_SIZE_ALLOC;
 		queue_work(rcu_gp_wq, &ssp->work.work);
 	}
 }
-- 
cgit v1.2.3


From c2445d38785086422e56dcbe049b73a53b2ba81f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 31 Jan 2022 13:27:15 -0800
Subject: srcu: Add contention check to call_srcu() srcu_data ->lock
 acquisition

This commit increases the sensitivity of contention detection by adding
checks to the acquisition of the srcu_data structure's lock on the
call_srcu() code path.

Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b9dec26245e0..862008c147b0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -330,18 +330,13 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
 }
 
 /*
- * Acquire the specified srcu_struct structure's ->lock, but check for
- * excessive contention, which results in initiation of a transition
- * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
- * parameter permits this.
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
  */
-static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
 {
 	unsigned long j;
 
-	if (spin_trylock_irqsave_rcu_node(ssp, *flags))
-		return;
-	spin_lock_irqsave_rcu_node(ssp, *flags);
 	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
 		return;
 	j = jiffies;
@@ -354,6 +349,38 @@ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned l
 	__srcu_transition_to_big(ssp);
 }
 
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+	struct srcu_struct *ssp = sdp->ssp;
+
+	if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+		return;
+	spin_lock_irqsave_rcu_node(ssp, *flags);
+	spin_lock_irqsave_check_contention(ssp);
+	spin_unlock_irqrestore_rcu_node(ssp, *flags);
+	spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+	if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+		return;
+	spin_lock_irqsave_rcu_node(ssp, *flags);
+	spin_lock_irqsave_check_contention(ssp);
+}
+
 /*
  * First-use initialization of statically allocated srcu_struct
  * structure.  Wiring up the combining tree is more than can be
@@ -1010,7 +1037,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 		sdp = per_cpu_ptr(ssp->sda, 0);
 	else
 		sdp = raw_cpu_ptr(ssp->sda);
-	spin_lock_irqsave_rcu_node(sdp, flags);
+	spin_lock_irqsave_sdp_contention(sdp, &flags);
 	if (rhp)
 		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
-- 
cgit v1.2.3


From 282d8998e9979c2186af7f7d22366f2fc3149838 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 8 Mar 2022 15:45:33 -0800
Subject: srcu: Prevent expedited GPs and blocking readers from consuming CPU

If an SRCU reader blocks while a synchronize_srcu_expedited() waits for
that same reader, then that grace period will spawn an endless series of
workqueue handlers, consuming a full CPU.  This quickly gets pointless
because consuming more CPU isn't going to make that reader get done
faster, especially if it is blocked waiting for an external event.

This commit therefore spawns at most one pair of back-to-back workqueue
handlers per expedited grace period phase, instead inserting increasing
delays as that grace period phase grows older, but capped at 10 jiffies.
In any case, if there have been at least 100 back-to-back workqueue
handlers within a single jiffy, regardless of grace period or grace-period
phase, then a one-jiffy delay is inserted.

[ paulmck:  Apply feedback from kernel test robot. ]

Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reported-by: Song Liu <song@kernel.org>
Tested-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 44 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 862008c147b0..6dd44e759f12 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -511,7 +511,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
 	return sum;
 }
 
-#define SRCU_INTERVAL		1
+#define SRCU_INTERVAL		1	// Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL	10	// Maximum incremental delay from slow readers.
+#define SRCU_MAX_NODELAY_PHASE	1	// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_MAX_NODELAY	100	// Maximum consecutive no-delay instances.
 
 /*
  * Return grace-period delay, zero if there are expedited grace
@@ -519,9 +522,18 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
  */
 static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 {
+	unsigned long jbase = SRCU_INTERVAL;
+
 	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
-		return 0;
-	return SRCU_INTERVAL;
+		jbase = 0;
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
+		jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
+	if (!jbase) {
+		WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+		if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
+			jbase = 1;
+	}
+	return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
 }
 
 /**
@@ -623,6 +635,8 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
 				       rcu_seq_snap(&ssp->srcu_gp_seq));
 	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
+	WRITE_ONCE(ssp->srcu_gp_start, jiffies);
+	WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
 	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
 	rcu_seq_start(&ssp->srcu_gp_seq);
 	state = rcu_seq_state(ssp->srcu_gp_seq);
@@ -706,7 +720,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	spin_lock_irq_rcu_node(ssp);
 	idx = rcu_seq_state(ssp->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
-	cbdelay = srcu_get_delay(ssp);
+	cbdelay = !!srcu_get_delay(ssp);
 	WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
 	rcu_seq_end(&ssp->srcu_gp_seq);
 	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -893,7 +907,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 		// the one boot CPU running with interrupts still disabled.
 		if (likely(srcu_init_done))
 			queue_delayed_work(rcu_gp_wq, &ssp->work,
-					   srcu_get_delay(ssp));
+					   !!srcu_get_delay(ssp));
 		else if (list_empty(&ssp->work.work.entry))
 			list_add(&ssp->work.work.entry, &srcu_boot_list);
 	}
@@ -1448,6 +1462,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 		srcu_flip(ssp);
 		spin_lock_irq_rcu_node(ssp);
 		rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
+		ssp->srcu_n_exp_nodelay = 0;
 		spin_unlock_irq_rcu_node(ssp);
 	}
 
@@ -1462,6 +1477,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
 			mutex_unlock(&ssp->srcu_gp_mutex);
 			return; /* readers present, retry later. */
 		}
+		ssp->srcu_n_exp_nodelay = 0;
 		srcu_gp_end(ssp);  /* Releases ->srcu_gp_mutex. */
 	}
 }
@@ -1552,12 +1568,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
  */
 static void process_srcu(struct work_struct *work)
 {
+	unsigned long curdelay;
+	unsigned long j;
 	struct srcu_struct *ssp;
 
 	ssp = container_of(work, struct srcu_struct, work.work);
 
 	srcu_advance_state(ssp);
-	srcu_reschedule(ssp, srcu_get_delay(ssp));
+	curdelay = srcu_get_delay(ssp);
+	if (curdelay) {
+		WRITE_ONCE(ssp->reschedule_count, 0);
+	} else {
+		j = jiffies;
+		if (READ_ONCE(ssp->reschedule_jiffies) == j) {
+			WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
+			if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+				curdelay = 1;
+		} else {
+			WRITE_ONCE(ssp->reschedule_count, 1);
+			WRITE_ONCE(ssp->reschedule_jiffies, j);
+		}
+	}
+	srcu_reschedule(ssp, curdelay);
 }
 
 void srcutorture_get_gp_data(enum rcutorture_type test_type,
-- 
cgit v1.2.3


From 586e31d59c436cda65a2e8ac04ff954bed247023 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Tue, 15 Mar 2022 09:55:49 +0100
Subject: srcu: Drop needless initialization of sdp in srcu_gp_start()

Commit 9c7ef4c30f12 ("srcu: Make Tree SRCU able to operate without
snp_node array") initializes the local variable sdp differently depending
on the srcu's state in srcu_gp_start().  Either way, this initialization
overwrites the value used when sdp is defined.

This commit therefore drops this pointless definition-time initialization.
Although there is no functional change, compiler code generation may
be affected.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6dd44e759f12..50ba70f019de 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
  */
 static void srcu_gp_start(struct srcu_struct *ssp)
 {
-	struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
+	struct srcu_data *sdp;
 	int state;
 
 	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
-- 
cgit v1.2.3


From c2aa2dfef243efe213a480a1ee8566507a5152f4 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Tue, 3 May 2022 01:09:56 -0700
Subject: seccomp: Add wait_killable semantic to seccomp user notifier

This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
that makes it so that when notifications are received by the supervisor the
notifying process will transition to wait killable semantics. Although wait
killable isn't a set of semantics formally exposed to userspace, the
concept is searchable. If the notifying process is signaled prior to the
notification being received by the userspace agent, it will be handled as
normal.

One quirk about how this is handled is that the notifying process
only switches to TASK_KILLABLE if it receives a wakeup from either
an addfd or a signal. This is to avoid an unnecessary wakeup of
the notifying task.

The reasons behind switching into wait_killable only after userspace
receives the notification are:
* Avoiding unncessary work - Often, workloads will perform work that they
  may abort (request racing comes to mind). This allows for syscalls to be
  aborted safely prior to the notification being received by the
  supervisor. In this, the supervisor doesn't end up doing work that the
  workload does not want to complete anyways.
* Avoiding side effects - We don't want the syscall to be interruptible
  once the supervisor starts doing work because it may not be trivial
  to reverse the operation. For example, unmounting a file system may
  take a long time, and it's hard to rollback, or treat that as
  reentrant.
* Avoid breaking runtimes - Various runtimes do not GC when they are
  during a syscall (or while running native code that subsequently
  calls a syscall). If many notifications are blocked, and not picked
  up by the supervisor, this can get the application into a bad state.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me
---
 kernel/seccomp.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3caa0fe85235..e9852d1b4a5e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
  *	   the filter can be freed.
  * @cache: cache of arch/syscall mappings to actions
  * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ *			notification is received by the userspace listener.
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
  * @notif: the struct that holds all notification related information
@@ -220,6 +222,7 @@ struct seccomp_filter {
 	refcount_t refs;
 	refcount_t users;
 	bool log;
+	bool wait_killable_recv;
 	struct action_cache cache;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
@@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags,
 	if (flags & SECCOMP_FILTER_FLAG_LOG)
 		filter->log = true;
 
+	/* Set wait killable flag, if present. */
+	if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+		filter->wait_killable_recv = true;
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
@@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
 	complete(&addfd->completion);
 }
 
+static bool should_sleep_killable(struct seccomp_filter *match,
+				  struct seccomp_knotif *n)
+{
+	return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT;
+}
+
 static int seccomp_do_user_notification(int this_syscall,
 					struct seccomp_filter *match,
 					const struct seccomp_data *sd)
@@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall,
 	 * This is where we wait for a reply from userspace.
 	 */
 	do {
+		bool wait_killable = should_sleep_killable(match, &n);
+
 		mutex_unlock(&match->notify_lock);
-		err = wait_for_completion_interruptible(&n.ready);
+		if (wait_killable)
+			err = wait_for_completion_killable(&n.ready);
+		else
+			err = wait_for_completion_interruptible(&n.ready);
 		mutex_lock(&match->notify_lock);
-		if (err != 0)
+
+		if (err != 0) {
+			/*
+			 * Check to see if the notifcation got picked up and
+			 * whether we should switch to wait killable.
+			 */
+			if (!wait_killable && should_sleep_killable(match, &n))
+				continue;
+
 			goto interrupted;
+		}
 
 		addfd = list_first_entry_or_null(&n.addfd,
 						 struct seccomp_kaddfd, list);
@@ -1484,6 +1511,9 @@ out:
 		mutex_lock(&filter->notify_lock);
 		knotif = find_notification(filter, unotif.id);
 		if (knotif) {
+			/* Reset the process to make sure it's not stuck */
+			if (should_sleep_killable(filter, knotif))
+				complete(&knotif->ready);
 			knotif->state = SECCOMP_NOTIFY_INIT;
 			up(&filter->notif->request);
 		}
@@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	    ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
 		return -EINVAL;
 
+	/*
+	 * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
+	 * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
+	 */
+	if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+	    ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
+		return -EINVAL;
+
 	/* Prepare the new filter before holding any locks. */
 	prepared = seccomp_prepare_user_filter(filter);
 	if (IS_ERR(prepared))
-- 
cgit v1.2.3


From 8707898e22fd665bc1d7b18b809be4b56ce25bdd Mon Sep 17 00:00:00 2001
From: Thomas Pfaff <tpfaff@pcs.com>
Date: Mon, 2 May 2022 13:28:29 +0200
Subject: genirq: Synchronize interrupt thread startup

A kernel hang can be observed when running setserial in a loop on a kernel
with force threaded interrupts. The sequence of events is:

   setserial
     open("/dev/ttyXXX")
       request_irq()
     do_stuff()
      -> serial interrupt
         -> wake(irq_thread)
	      desc->threads_active++;
     close()
       free_irq()
         kthread_stop(irq_thread)
     synchronize_irq() <- hangs because desc->threads_active != 0

The thread is created in request_irq() and woken up, but does not get on a
CPU to reach the actual thread function, which would handle the pending
wake-up. kthread_stop() sets the should stop condition which makes the
thread immediately exit, which in turn leaves the stale threads_active
count around.

This problem was introduced with commit 519cc8652b3a, which addressed a
interrupt sharing issue in the PCIe code.

Before that commit free_irq() invoked synchronize_irq(), which waits for
the hard interrupt handler and also for associated threads to complete.

To address the PCIe issue synchronize_irq() was replaced with
__synchronize_hardirq(), which only waits for the hard interrupt handler to
complete, but not for threaded handlers.

This was done under the assumption, that the interrupt thread already
reached the thread function and waits for a wake-up, which is guaranteed to
be handled before acting on the stop condition. The problematic case, that
the thread would not reach the thread function, was obviously overlooked.

Make sure that the interrupt thread is really started and reaches
thread_fn() before returning from __setup_irq().

This utilizes the existing wait queue in the interrupt descriptor. The
wait queue is unused for non-shared interrupts. For shared interrupts the
usage might cause a spurious wake-up of a waiter in synchronize_irq() or the
completion of a threaded handler might cause a spurious wake-up of the
waiter for the ready flag. Both are harmless and have no functional impact.

[ tglx: Amended changelog ]

Fixes: 519cc8652b3a ("genirq: Synchronize only with single thread on free_irq()")
Signed-off-by: Thomas Pfaff <tpfaff@pcs.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/552fe7b4-9224-b183-bb87-a8f36d335690@pcs.com
---
 kernel/irq/internals.h |  2 ++
 kernel/irq/irqdesc.c   |  2 ++
 kernel/irq/manage.c    | 39 +++++++++++++++++++++++++++++----------
 3 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99cbdf55a8bd..f09c60393e55 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -29,12 +29,14 @@ extern struct irqaction chained_action;
  * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
  * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
  * IRQTF_FORCED_THREAD  - irq action is force threaded
+ * IRQTF_READY     - signals that irq thread is ready
  */
 enum {
 	IRQTF_RUNTHREAD,
 	IRQTF_WARNED,
 	IRQTF_AFFINITY,
 	IRQTF_FORCED_THREAD,
+	IRQTF_READY,
 };
 
 /*
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 939d21cd55c3..0099b87dd853 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -407,6 +407,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	mutex_init(&desc->request_mutex);
 	init_rcu_head(&desc->rcu);
+	init_waitqueue_head(&desc->wait_for_threads);
 
 	desc_set_defaults(irq, desc, node, affinity, owner);
 	irqd_set(&desc->irq_data, flags);
@@ -575,6 +576,7 @@ int __init early_irq_init(void)
 		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 		mutex_init(&desc[i].request_mutex);
+		init_waitqueue_head(&desc[i].wait_for_threads);
 		desc_set_defaults(i, &desc[i], node, NULL, NULL);
 	}
 	return arch_early_irq_init();
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c03f71d5ec10..e3e245a4fd70 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1248,6 +1248,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
 	raw_spin_unlock_irq(&desc->lock);
 }
 
+/*
+ * Internal function to notify that a interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+				 struct irqaction *action)
+{
+	set_bit(IRQTF_READY, &action->thread_flags);
+	wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up a interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+						  struct irqaction *action)
+{
+	if (!action || !action->thread)
+		return;
+
+	wake_up_process(action->thread);
+	wait_event(desc->wait_for_threads,
+		   test_bit(IRQTF_READY, &action->thread_flags));
+}
+
 /*
  * Interrupt handler thread
  */
@@ -1259,6 +1284,8 @@ static int irq_thread(void *data)
 	irqreturn_t (*handler_fn)(struct irq_desc *desc,
 			struct irqaction *action);
 
+	irq_thread_set_ready(desc, action);
+
 	sched_set_fifo(current);
 
 	if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
@@ -1683,8 +1710,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	}
 
 	if (!shared) {
-		init_waitqueue_head(&desc->wait_for_threads);
-
 		/* Setup the type (level, edge polarity) if configured: */
 		if (new->flags & IRQF_TRIGGER_MASK) {
 			ret = __irq_set_trigger(desc,
@@ -1780,14 +1805,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 	irq_setup_timings(desc, new);
 
-	/*
-	 * Strictly no need to wake it up, but hung_task complains
-	 * when no hard interrupt wakes the thread up.
-	 */
-	if (new->thread)
-		wake_up_process(new->thread);
-	if (new->secondary)
-		wake_up_process(new->secondary->thread);
+	wake_up_and_wait_for_irq_thread_ready(desc, new);
+	wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
 
 	register_irq_proc(irq, desc);
 	new->dir = NULL;
-- 
cgit v1.2.3


From 2685027fca387b602ae565bff17895188b803988 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 27 Apr 2022 10:54:28 -0400
Subject: cgroup/cpuset: Remove cpus_allowed/mems_allowed setup in
 cpuset_init_smp()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are 3 places where the cpu and node masks of the top cpuset can
be initialized in the order they are executed:
 1) start_kernel -> cpuset_init()
 2) start_kernel -> cgroup_init() -> cpuset_bind()
 3) kernel_init_freeable() -> do_basic_setup() -> cpuset_init_smp()

The first cpuset_init() call just sets all the bits in the masks.
The second cpuset_bind() call sets cpus_allowed and mems_allowed to the
default v2 values. The third cpuset_init_smp() call sets them back to
v1 values.

For systems with cgroup v2 setup, cpuset_bind() is called once.  As a
result, cpu and memory node hot add may fail to update the cpu and node
masks of the top cpuset to include the newly added cpu or node in a
cgroup v2 environment.

For systems with cgroup v1 setup, cpuset_bind() is called again by
rebind_subsystem() when the v1 cpuset filesystem is mounted as shown
in the dmesg log below with an instrumented kernel.

  [    2.609781] cpuset_bind() called - v2 = 1
  [    3.079473] cpuset_init_smp() called
  [    7.103710] cpuset_bind() called - v2 = 0

smp_init() is called after the first two init functions.  So we don't
have a complete list of active cpus and memory nodes until later in
cpuset_init_smp() which is the right time to set up effective_cpus
and effective_mems.

To fix this cgroup v2 mask setup problem, the potentially incorrect
cpus_allowed & mems_allowed setting in cpuset_init_smp() are removed.
For cgroup v2 systems, the initial cpuset_bind() call will set the masks
correctly.  For cgroup v1 systems, the second call to cpuset_bind()
will do the right setup.

cc: stable@vger.kernel.org
Signed-off-by: Waiman Long <longman@redhat.com>
Tested-by: Feng Tang <feng.tang@intel.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 9390bfd9f1cd..71a418858a5e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3390,8 +3390,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
  */
 void __init cpuset_init_smp(void)
 {
-	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
-	top_cpuset.mems_allowed = node_states[N_MEMORY];
+	/*
+	 * cpus_allowd/mems_allowed set to v2 values in the initial
+	 * cpuset_bind() call will be reset to v1 values in another
+	 * cpuset_bind() call when v1 cpuset is mounted.
+	 */
 	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
 
 	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
-- 
cgit v1.2.3


From 701850dc0c31bfadf75a0a74af7d2c97859945ec Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 3 May 2022 09:38:44 +0200
Subject: printk, tracing: fix console tracepoint

The original intent of the 'console' tracepoint per the commit 95100358491a
("printk/tracing: Add console output tracing") had been to "[...] record
any printk messages into the trace, regardless of the current console
loglevel. This can help correlate (existing) printk debugging with other
tracing."

Petr points out [1] that calling trace_console_rcuidle() in
call_console_driver() had been the wrong thing for a while, because
"printk() always used console_trylock() and the message was flushed to
the console only when the trylock succeeded. And it was always deferred
in NMI or when printed via printk_deferred()."

With the commit 09c5ba0aa2fc ("printk: add kthread console printers"),
things only got worse, and calls to call_console_driver() no longer
happen with typical printk() calls but always appear deferred [2].

As such, the tracepoint can no longer serve its purpose to clearly
correlate printk() calls and other tracing, as well as breaks usecases
that expect every printk() call to result in a callback of the console
tracepoint. Notably, the KFENCE and KCSAN test suites, which want to
capture console output and assume a printk() immediately gives us a
callback to the console tracepoint.

Fix the console tracepoint by moving it into printk_sprint() [3].

One notable difference is that by moving tracing into printk_sprint(),
the 'text' will no longer include the "header" (loglevel and timestamp),
but only the raw message. Arguably this is less of a problem now that
the console tracepoint happens on the printk() call and isn't delayed.

Link: https://lore.kernel.org/all/Ym+WqKStCg%2FEHfh3@alley/ [1]
Link: https://lore.kernel.org/all/CA+G9fYu2kS0wR4WqMRsj2rePKV9XLgOU1PiXnMvpT+Z=c2ucHA@mail.gmail.com/ [2]
Link: https://lore.kernel.org/all/87fslup9dx.fsf@jogness.linutronix.de/ [3]
Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Signed-off-by: Marco Elver <elver@google.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: John Ogness <john.ogness@linutronix.de>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220503073844.4148944-1-elver@google.com
---
 kernel/printk/printk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f66d6e72a642..a3e1035929b0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2064,8 +2064,6 @@ static void call_console_driver(struct console *con, const char *text, size_t le
 {
 	size_t dropped_len;
 
-	trace_console_rcuidle(text, len);
-
 	if (con->dropped && dropped_text) {
 		dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
 				       "** %lu printk messages dropped **\n",
@@ -2240,6 +2238,8 @@ static u16 printk_sprint(char *text, u16 size, int facility,
 		}
 	}
 
+	trace_console_rcuidle(text, text_len);
+
 	return text_len;
 }
 
-- 
cgit v1.2.3


From 343f4c49f2438d8920f1f76fa823ee59b91f02e4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 11 Apr 2022 11:40:14 -0500
Subject: kthread: Don't allocate kthread_struct for init and umh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If kthread_is_per_cpu runs concurrently with free_kthread_struct the
kthread_struct that was just freed may be read from.

This bug was introduced by commit 40966e316f86 ("kthread: Ensure
struct kthread is present for all kthreads").  When kthread_struct
started to be allocated for all tasks that have PF_KTHREAD set.  This
in turn required the kthread_struct to be freed in kernel_execve and
violated the assumption that kthread_struct will have the same
lifetime as the task.

Looking a bit deeper this only applies to callers of kernel_execve
which is just the init process and the user mode helper processes.
These processes really don't want to be kernel threads but are for
historical reasons.  Mostly that copy_thread does not know how to take
a kernel mode function to the process with for processes without
PF_KTHREAD or PF_IO_WORKER set.

Solve this by not allocating kthread_struct for the init process and
the user mode helper processes.

This is done by adding a kthread member to struct kernel_clone_args.
Setting kthread in fork_idle and kernel_thread.  Adding
user_mode_thread that works like kernel_thread except it does not set
kthread.  In fork only allocating the kthread_struct if .kthread is set.

I have looked at kernel/kthread.c and since commit 40966e316f86
("kthread: Ensure struct kthread is present for all kthreads") there
have been no assumptions added that to_kthread or __to_kthread will
not return NULL.

There are a few callers of to_kthread or __to_kthread that assume a
non-NULL struct kthread pointer will be returned.  These functions are
kthread_data(), kthread_parmme(), kthread_exit(), kthread(),
kthread_park(), kthread_unpark(), kthread_stop().  All of those functions
can reasonably expected to be called when it is know that a task is a
kthread so that assumption seems reasonable.

Cc: stable@vger.kernel.org
Fixes: 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads")
Reported-by: Максим Кутявин <maximkabox13@gmail.com>
Link: https://lkml.kernel.org/r/20220506141512.516114-1-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 22 ++++++++++++++++++++--
 kernel/umh.c  |  6 +++---
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..27c5203750b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2157,7 +2157,7 @@ static __latent_entropy struct task_struct *copy_process(
 	p->io_context = NULL;
 	audit_set_context(p, NULL);
 	cgroup_fork(p);
-	if (p->flags & PF_KTHREAD) {
+	if (args->kthread) {
 		if (!set_kthread_struct(p))
 			goto bad_fork_cleanup_delayacct;
 	}
@@ -2548,7 +2548,8 @@ struct task_struct * __init fork_idle(int cpu)
 {
 	struct task_struct *task;
 	struct kernel_clone_args args = {
-		.flags = CLONE_VM,
+		.flags		= CLONE_VM,
+		.kthread	= 1,
 	};
 
 	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@ -2679,6 +2680,23 @@ pid_t kernel_clone(struct kernel_clone_args *args)
  * Create a kernel thread.
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+	struct kernel_clone_args args = {
+		.flags		= ((lower_32_bits(flags) | CLONE_VM |
+				    CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+		.kthread	= 1,
+	};
+
+	return kernel_clone(&args);
+}
+
+/*
+ * Create a user mode thread.
+ */
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
 	struct kernel_clone_args args = {
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
diff --git a/kernel/umh.c b/kernel/umh.c
index 36c123360ab8..b989736e8707 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -132,7 +132,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 
 	/* If SIGCLD is ignored do_wait won't populate the status. */
 	kernel_sigaction(SIGCHLD, SIG_DFL);
-	pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
+	pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
 	if (pid < 0)
 		sub_info->retval = pid;
 	else
@@ -171,8 +171,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
 		 * want to pollute current->children, and we need a parent
 		 * that always ignores SIGCHLD to ensure auto-reaping.
 		 */
-		pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
-				    CLONE_PARENT | SIGCHLD);
+		pid = user_mode_thread(call_usermodehelper_exec_async, sub_info,
+				       CLONE_PARENT | SIGCHLD);
 		if (pid < 0) {
 			sub_info->retval = pid;
 			umh_complete(sub_info);
-- 
cgit v1.2.3


From c5febea0956fd3874e8fb59c6f84d68f128d68f8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 Apr 2022 18:07:50 -0500
Subject: fork: Pass struct kernel_clone_args into copy_thread

With io_uring we have started supporting tasks that are for most
purposes user space tasks that exclusively run code in kernel mode.

The kernel task that exec's init and tasks that exec user mode
helpers are also user mode tasks that just run kernel code
until they call kernel execve.

Pass kernel_clone_args into copy_thread so these oddball
tasks can be supported more cleanly and easily.

v2: Fix spelling of kenrel_clone_args on h8300
Link: https://lkml.kernel.org/r/20220506141512.516114-2-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 27c5203750b4..d39a248a8d8d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1979,7 +1979,7 @@ static __latent_entropy struct task_struct *copy_process(
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
 	struct file *pidfile = NULL;
-	u64 clone_flags = args->flags;
+	const u64 clone_flags = args->flags;
 	struct nsproxy *nsp = current->nsproxy;
 
 	/*
@@ -2240,7 +2240,7 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
+	retval = copy_thread(p, args);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-- 
cgit v1.2.3


From 36cb0e1cda645ee645b85a6ce652cb46a16e14e5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 11 Apr 2022 16:17:28 -0500
Subject: fork: Explicity test for idle tasks in copy_thread

The architectures ia64 and parisc have special handling for the idle
thread in copy_process.  Add a flag named idle to kernel_clone_args
and use it to explicity test if an idle process is being created.

Fullfill the expectations of the rest of the copy_thread
implemetations and pass a function pointer in .stack from fork_idle().
This makes what is happening in copy_thread better defined, and is
useful to make idle threads less special.

Link: https://lkml.kernel.org/r/20220506141512.516114-3-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d39a248a8d8d..93d77ee921ff 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2544,12 +2544,21 @@ static inline void init_idle_pids(struct task_struct *idle)
 	}
 }
 
+static int idle_dummy(void *dummy)
+{
+	/* This function is never called */
+	return 0;
+}
+
 struct task_struct * __init fork_idle(int cpu)
 {
 	struct task_struct *task;
 	struct kernel_clone_args args = {
 		.flags		= CLONE_VM,
+		.stack		= (unsigned long)&idle_dummy,
+		.stack_size	= (unsigned long)NULL,
 		.kthread	= 1,
+		.idle		= 1,
 	};
 
 	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
-- 
cgit v1.2.3


From 5bd2e97c868a8a44470950ed01846cab6328e540 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 12 Apr 2022 10:18:48 -0500
Subject: fork: Generalize PF_IO_WORKER handling

Add fn and fn_arg members into struct kernel_clone_args and test for
them in copy_thread (instead of testing for PF_KTHREAD | PF_IO_WORKER).
This allows any task that wants to be a user space task that only runs
in kernel mode to use this functionality.

The code on x86 is an exception and still retains a PF_KTHREAD test
because x86 unlikely everything else handles kthreads slightly
differently than user space tasks that start with a function.

The functions that created tasks that start with a function
have been updated to set ".fn" and ".fn_arg" instead of
".stack" and ".stack_size".  These functions are fork_idle(),
create_io_thread(), kernel_thread(), and user_mode_thread().

Link: https://lkml.kernel.org/r/20220506141512.516114-4-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 93d77ee921ff..8e17c3fbce42 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2555,8 +2555,8 @@ struct task_struct * __init fork_idle(int cpu)
 	struct task_struct *task;
 	struct kernel_clone_args args = {
 		.flags		= CLONE_VM,
-		.stack		= (unsigned long)&idle_dummy,
-		.stack_size	= (unsigned long)NULL,
+		.fn		= &idle_dummy,
+		.fn_arg		= NULL,
 		.kthread	= 1,
 		.idle		= 1,
 	};
@@ -2589,8 +2589,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
 				    CLONE_UNTRACED) & ~CSIGNAL),
 		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
-		.stack		= (unsigned long)fn,
-		.stack_size	= (unsigned long)arg,
+		.fn		= fn,
+		.fn_arg		= arg,
 		.io_thread	= 1,
 	};
 
@@ -2694,8 +2694,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
 				    CLONE_UNTRACED) & ~CSIGNAL),
 		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
-		.stack		= (unsigned long)fn,
-		.stack_size	= (unsigned long)arg,
+		.fn		= fn,
+		.fn_arg		= arg,
 		.kthread	= 1,
 	};
 
@@ -2711,8 +2711,8 @@ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
 				    CLONE_UNTRACED) & ~CSIGNAL),
 		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
-		.stack		= (unsigned long)fn,
-		.stack_size	= (unsigned long)arg,
+		.fn		= fn,
+		.fn_arg		= arg,
 	};
 
 	return kernel_clone(&args);
-- 
cgit v1.2.3


From 753550eb0ce1fea4b5cbd989f2e06ef80b2feb28 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 11 Apr 2022 14:13:56 -0500
Subject: fork: Explicitly set PF_KTHREAD

Instead of implicitly inheriting PF_KTHREAD from the parent process
examine arguments in kernel_clone_args to see if PF_KTHREAD should be
set.  This makes knowledge of which new threads are kernel threads
explicit.

This also makes it so that init and the user mode helper processes
no longer have PF_KTHREAD set.

Link: https://lkml.kernel.org/r/20220506141512.516114-6-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8e17c3fbce42..35645f57bd2f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2068,6 +2068,9 @@ static __latent_entropy struct task_struct *copy_process(
 	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
+	p->flags &= ~PF_KTHREAD;
+	if (args->kthread)
+		p->flags |= PF_KTHREAD;
 	if (args->io_thread) {
 		/*
 		 * Mark us an IO worker, and block any signal that isn't
-- 
cgit v1.2.3


From 1e8ca62b79dec20aeded4fe283f4890e5016767a Mon Sep 17 00:00:00 2001
From: Daniel Mentz <danielmentz@google.com>
Date: Wed, 13 Apr 2022 16:36:49 -0700
Subject: kheaders: Have cpio unconditionally replace files

For out-of-tree builds, this script invokes cpio twice to copy header
files from the srctree and subsequently from the objtree. According to a
comment in the script, there might be situations in which certain files
already exist in the destination directory when header files are copied
from the objtree:

"The second CPIO can complain if files already exist which can happen
with out of tree builds having stale headers in srctree. Just silence
CPIO for now."

GNU cpio might simply print a warning like "newer or same age version
exists", but toybox cpio exits with a non-zero exit code unless the
command line option "-u" is specified.

To improve compatibility with toybox cpio, add the command line option
"-u" to unconditionally replace existing files in the destination
directory.

Signed-off-by: Daniel Mentz <danielmentz@google.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 kernel/gen_kheaders.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 1966a749e0d9..0c78e64f747d 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -74,7 +74,7 @@ fi
 # of tree builds having stale headers in srctree. Just silence CPIO for now.
 for f in $dir_list;
 	do find "$f" -name "*.h";
-done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1
 
 # Remove comments except SDPX lines
 find $cpio_dir -type f -print0 |
-- 
cgit v1.2.3


From 2e5920bb073a4e3e69cf8e581836cafc8ba1b464 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Fri, 6 May 2022 19:43:57 +0800
Subject: kdump: return -ENOENT if required cmdline option does not exist

According to the current crashkernel=Y,low support in other ARCHes, it's
an optional command-line option. When it doesn't exist, kernel will try
to allocate minimum required memory below 4G automatically.

However, __parse_crashkernel() returns '-EINVAL' for all error cases. It
can't distinguish the nonexistent option from invalid option.

Change __parse_crashkernel() to return '-ENOENT' for the nonexistent option
case. With this change, crashkernel,low memory will take the default
value if crashkernel=,low is not specified; while crashkernel reservation
will fail and bail out if an invalid option is specified.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Acked-by: Baoquan He <bhe@redhat.com>
Link: https://lore.kernel.org/r/20220506114402.365-2-thunder.leizhen@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 kernel/crash_core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 256cf6db573c..4d57c03714f4 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -243,9 +243,8 @@ static int __init __parse_crashkernel(char *cmdline,
 	*crash_base = 0;
 
 	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
 	if (!ck_cmdline)
-		return -EINVAL;
+		return -ENOENT;
 
 	ck_cmdline += strlen(name);
 
-- 
cgit v1.2.3


From 595b893e2087de306d0781795fb8ec47873596a6 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 3 May 2022 13:55:00 -0700
Subject: randstruct: Reorganize Kconfigs and attribute macros

In preparation for Clang supporting randstruct, reorganize the Kconfigs,
move the attribute macros, and generalize the feature to be named
CONFIG_RANDSTRUCT for on/off, CONFIG_RANDSTRUCT_FULL for the full
randomization mode, and CONFIG_RANDSTRUCT_PERFORMANCE for the cache-line
sized mode.

Cc: linux-hardening@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220503205503.3054173-4-keescook@chromium.org
---
 kernel/panic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index eb4dfb932c85..8355b19676f8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -48,7 +48,7 @@ unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
 
 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
 static unsigned long tainted_mask =
-	IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
+	IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
 static int pause_on_oops;
 static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
-- 
cgit v1.2.3


From a12685e2d1f7eed58ee408ba375606577b59610c Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:17 +0100
Subject: stackleak: move skip_erasing() check earlier

In stackleak_erase() we check skip_erasing() after accessing some fields
from current. As generating the address of current uses asm which
hazards with the static branch asm, this work is always performed, even
when the static branch is patched to jump to the return at the end of the
function.

This patch avoids this redundant work by moving the skip_erasing() check
earlier.

To avoid complicating initialization within stackleak_erase(), the body
of the function is split out into a __stackleak_erase() helper, with the
check left in a wrapper function. The __stackleak_erase() helper is
marked __always_inline to ensure that this is inlined into
stackleak_erase() and not instrumented.

Before this patch, on x86-64 w/ GCC 11.1.0 the start of the function is:

<stackleak_erase>:
   65 48 8b 04 25 00 00    mov    %gs:0x0,%rax
   00 00
   48 8b 48 20             mov    0x20(%rax),%rcx
   48 8b 80 98 0a 00 00    mov    0xa98(%rax),%rax
   66 90                   xchg   %ax,%ax  <------------ static branch
   48 89 c2                mov    %rax,%rdx
   48 29 ca                sub    %rcx,%rdx
   48 81 fa ff 3f 00 00    cmp    $0x3fff,%rdx

After this patch, on x86-64 w/ GCC 11.1.0 the start of the function is:

<stackleak_erase>:
   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)  <--- static branch
   65 48 8b 04 25 00 00    mov    %gs:0x0,%rax
   00 00
   48 8b 48 20             mov    0x20(%rax),%rcx
   48 8b 80 98 0a 00 00    mov    0xa98(%rax),%rax
   48 89 c2                mov    %rax,%rdx
   48 29 ca                sub    %rcx,%rdx
   48 81 fa ff 3f 00 00    cmp    $0x3fff,%rdx

Before this patch, on arm64 w/ GCC 11.1.0 the start of the function is:

<stackleak_erase>:
   d503245f        bti     c
   d5384100        mrs     x0, sp_el0
   f9401003        ldr     x3, [x0, #32]
   f9451000        ldr     x0, [x0, #2592]
   d503201f        nop  <------------------------------- static branch
   d503233f        paciasp
   cb030002        sub     x2, x0, x3
   d287ffe1        mov     x1, #0x3fff
   eb01005f        cmp     x2, x1

After this patch, on arm64 w/ GCC 11.1.0 the start of the function is:

<stackleak_erase>:
   d503245f        bti     c
   d503201f        nop  <------------------------------- static branch
   d503233f        paciasp
   d5384100        mrs     x0, sp_el0
   f9401003        ldr     x3, [x0, #32]
   d287ffe1        mov     x1, #0x3fff
   f9451000        ldr     x0, [x0, #2592]
   cb030002        sub     x2, x0, x3
   eb01005f        cmp     x2, x1

While this may not be a huge win on its own, moving the static branch
will permit further optimization of the body of the function in
subsequent patches.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-3-mark.rutland@arm.com
---
 kernel/stackleak.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index ddb5a7f48d69..753eab797a04 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init);
 #define skip_erasing()	false
 #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
 
-asmlinkage void noinstr stackleak_erase(void)
+static __always_inline void __stackleak_erase(void)
 {
 	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
 	unsigned long kstack_ptr = current->lowest_stack;
@@ -78,9 +78,6 @@ asmlinkage void noinstr stackleak_erase(void)
 	unsigned int poison_count = 0;
 	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
 
-	if (skip_erasing())
-		return;
-
 	/* Check that 'lowest_stack' value is sane */
 	if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
 		kstack_ptr = boundary;
@@ -125,6 +122,14 @@ asmlinkage void noinstr stackleak_erase(void)
 	current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
 }
 
+asmlinkage void noinstr stackleak_erase(void)
+{
+	if (skip_erasing())
+		return;
+
+	__stackleak_erase();
+}
+
 void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
 {
 	unsigned long sp = current_stack_pointer;
-- 
cgit v1.2.3


From ac7838b4e1c552d54e67f78a29bc1bd7701c13e8 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:18 +0100
Subject: stackleak: remove redundant check

In __stackleak_erase() we check that the `erase_low` value derived from
`current->lowest_stack` is above the lowest legitimate stack pointer
value, but this is already enforced by stackleak_track_stack() when
recording the lowest stack value.

Remove the redundant check.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-4-mark.rutland@arm.com
---
 kernel/stackleak.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 753eab797a04..f7a0f8cf73c3 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -78,10 +78,6 @@ static __always_inline void __stackleak_erase(void)
 	unsigned int poison_count = 0;
 	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
 
-	/* Check that 'lowest_stack' value is sane */
-	if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
-		kstack_ptr = boundary;
-
 	/* Search for the poison value in the kernel stack */
 	while (kstack_ptr > boundary && poison_count <= depth) {
 		if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
-- 
cgit v1.2.3


From 9ec79840d6afaf472294588a6bbe145bcdffa28b Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:19 +0100
Subject: stackleak: rework stack low bound handling

In stackleak_task_init(), stackleak_track_stack(), and
__stackleak_erase(), we open-code skipping the STACK_END_MAGIC at the
bottom of the stack. Each case is implemented slightly differently, and
only the __stackleak_erase() case is commented.

In stackleak_task_init() and stackleak_track_stack() we unconditionally
add sizeof(unsigned long) to the lowest stack address. In
stackleak_task_init() we use end_of_stack() for this, and in
stackleak_track_stack() we use task_stack_page(). In __stackleak_erase()
we handle this by detecting if `kstack_ptr` has hit the stack end
boundary, and if so, conditionally moving it above the magic.

This patch adds a new stackleak_task_low_bound() helper which is used in
all three cases, which unconditionally adds sizeof(unsigned long) to the
lowest address on the task stack, with commentary as to why. This uses
end_of_stack() as stackleak_task_init() did prior to this patch, as this
is consistent with the code in kernel/fork.c which initializes the
STACK_END_MAGIC value.

In __stackleak_erase() we no longer need to check whether we've spilled
into the STACK_END_MAGIC value, as stackleak_track_stack() ensures that
`current->lowest_stack` stops immediately above this, and similarly the
poison scan will stop immediately above this.

For stackleak_task_init() and stackleak_track_stack() this results in no
change to code generation. For __stackleak_erase() the generated
assembly is slightly simpler and shorter.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-5-mark.rutland@arm.com
---
 kernel/stackleak.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index f7a0f8cf73c3..24b7cf01b297 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -72,9 +72,11 @@ late_initcall(stackleak_sysctls_init);
 
 static __always_inline void __stackleak_erase(void)
 {
+	const unsigned long task_stack_low = stackleak_task_low_bound(current);
+
 	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
 	unsigned long kstack_ptr = current->lowest_stack;
-	unsigned long boundary = (unsigned long)end_of_stack(current);
+	unsigned long boundary = task_stack_low;
 	unsigned int poison_count = 0;
 	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
 
@@ -88,13 +90,6 @@ static __always_inline void __stackleak_erase(void)
 		kstack_ptr -= sizeof(unsigned long);
 	}
 
-	/*
-	 * One 'long int' at the bottom of the thread stack is reserved and
-	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
-	 */
-	if (kstack_ptr == boundary)
-		kstack_ptr += sizeof(unsigned long);
-
 #ifdef CONFIG_STACKLEAK_METRICS
 	current->prev_lowest_stack = kstack_ptr;
 #endif
@@ -140,8 +135,7 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
 	/* 'lowest_stack' should be aligned on the register width boundary */
 	sp = ALIGN(sp, sizeof(unsigned long));
 	if (sp < current->lowest_stack &&
-	    sp >= (unsigned long)task_stack_page(current) +
-						sizeof(unsigned long)) {
+	    sp >= stackleak_task_low_bound(current)) {
 		current->lowest_stack = sp;
 	}
 }
-- 
cgit v1.2.3


From 1723d39d2fe49b8a75939060c95e7e908b463793 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:20 +0100
Subject: stackleak: clarify variable names

The logic within __stackleak_erase() can be a little hard to follow, as
`boundary` switches from being the low bound to the high bound mid way
through the function, and `kstack_ptr` is used to represent the start of
the region to erase while `boundary` represents the end of the region to
erase.

Make this a little clearer by consistently using clearer variable names.
The `boundary` variable is removed, the bounds of the region to erase
are described by `erase_low` and `erase_high`, and bounds of the task
stack are described by `task_stack_low` and `task_stack_high`.

As the same time, remove the comment above the variables, since it is
unclear whether it's intended as rationale, a complaint, or a TODO, and
is more confusing than helpful.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-6-mark.rutland@arm.com
---
 kernel/stackleak.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 24b7cf01b297..d5f684dc0a2d 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -73,40 +73,38 @@ late_initcall(stackleak_sysctls_init);
 static __always_inline void __stackleak_erase(void)
 {
 	const unsigned long task_stack_low = stackleak_task_low_bound(current);
-
-	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
-	unsigned long kstack_ptr = current->lowest_stack;
-	unsigned long boundary = task_stack_low;
+	unsigned long erase_low = current->lowest_stack;
+	unsigned long erase_high;
 	unsigned int poison_count = 0;
 	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
 
 	/* Search for the poison value in the kernel stack */
-	while (kstack_ptr > boundary && poison_count <= depth) {
-		if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
+	while (erase_low > task_stack_low && poison_count <= depth) {
+		if (*(unsigned long *)erase_low == STACKLEAK_POISON)
 			poison_count++;
 		else
 			poison_count = 0;
 
-		kstack_ptr -= sizeof(unsigned long);
+		erase_low -= sizeof(unsigned long);
 	}
 
 #ifdef CONFIG_STACKLEAK_METRICS
-	current->prev_lowest_stack = kstack_ptr;
+	current->prev_lowest_stack = erase_low;
 #endif
 
 	/*
-	 * Now write the poison value to the kernel stack. Start from
-	 * 'kstack_ptr' and move up till the new 'boundary'. We assume that
-	 * the stack pointer doesn't change when we write poison.
+	 * Now write the poison value to the kernel stack between 'erase_low'
+	 * and 'erase_high'. We assume that the stack pointer doesn't change
+	 * when we write poison.
 	 */
 	if (on_thread_stack())
-		boundary = current_stack_pointer;
+		erase_high = current_stack_pointer;
 	else
-		boundary = current_top_of_stack();
+		erase_high = current_top_of_stack();
 
-	while (kstack_ptr < boundary) {
-		*(unsigned long *)kstack_ptr = STACKLEAK_POISON;
-		kstack_ptr += sizeof(unsigned long);
+	while (erase_low < erase_high) {
+		*(unsigned long *)erase_low = STACKLEAK_POISON;
+		erase_low += sizeof(unsigned long);
 	}
 
 	/* Reset the 'lowest_stack' value for the next syscall */
-- 
cgit v1.2.3


From 0cfa2ccd285d98ad62218add2eebdcfff69fb2c0 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:21 +0100
Subject: stackleak: rework stack high bound handling

Prior to returning to userspace, we reset current->lowest_stack to a
reasonable high bound. Currently we do this by subtracting the arbitrary
value `THREAD_SIZE/64` from the top of the stack, for reasons lost to
history.

Looking at configurations today:

* On i386 where THREAD_SIZE is 8K, the bound will be 128 bytes. The
  pt_regs at the top of the stack is 68 bytes (with 0 to 16 bytes of
  padding above), and so this covers an additional portion of 44 to 60
  bytes.

* On x86_64 where THREAD_SIZE is at least 16K (up to 32K with KASAN) the
  bound will be at least 256 bytes (up to 512 with KASAN). The pt_regs
  at the top of the stack is 168 bytes, and so this cover an additional
  88 bytes of stack (up to 344 with KASAN).

* On arm64 where THREAD_SIZE is at least 16K (up to 64K with 64K pages
  and VMAP_STACK), the bound will be at least 256 bytes (up to 1024 with
  KASAN). The pt_regs at the top of the stack is 336 bytes, so this can
  fall within the pt_regs, or can cover an additional 688 bytes of
  stack.

Clearly the `THREAD_SIZE/64` value doesn't make much sense -- in the
worst case, this will cause more than 600 bytes of stack to be erased
for every syscall, even if actual stack usage were substantially
smaller.

This patches makes this slightly less nonsensical by consistently
resetting current->lowest_stack to the base of the task pt_regs. For
clarity and for consistency with the handling of the low bound, the
generation of the high bound is split into a helper with commentary
explaining why.

Since the pt_regs at the top of the stack will be clobbered upon the
next exception entry, we don't need to poison these at exception exit.
By using task_pt_regs() as the high stack boundary instead of
current_top_of_stack() we avoid some redundant poisoning, and the
compiler can share the address generation between the poisoning and
resetting of `current->lowest_stack`, making the generated code more
optimal.

It's not clear to me whether the existing `THREAD_SIZE/64` offset was a
dodgy heuristic to skip the pt_regs, or whether it was attempting to
minimize the number of times stackleak_check_stack() would have to
update `current->lowest_stack` when stack usage was shallow at the cost
of unconditionally poisoning a small portion of the stack for every exit
to userspace.

For now I've simply removed the offset, and if we need/want to minimize
updates for shallow stack usage it should be easy to add a better
heuristic atop, with appropriate commentary so we know what's going on.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-7-mark.rutland@arm.com
---
 kernel/stackleak.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index d5f684dc0a2d..ba346d46218f 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -73,6 +73,7 @@ late_initcall(stackleak_sysctls_init);
 static __always_inline void __stackleak_erase(void)
 {
 	const unsigned long task_stack_low = stackleak_task_low_bound(current);
+	const unsigned long task_stack_high = stackleak_task_high_bound(current);
 	unsigned long erase_low = current->lowest_stack;
 	unsigned long erase_high;
 	unsigned int poison_count = 0;
@@ -93,14 +94,22 @@ static __always_inline void __stackleak_erase(void)
 #endif
 
 	/*
-	 * Now write the poison value to the kernel stack between 'erase_low'
-	 * and 'erase_high'. We assume that the stack pointer doesn't change
-	 * when we write poison.
+	 * Write poison to the task's stack between 'erase_low' and
+	 * 'erase_high'.
+	 *
+	 * If we're running on a different stack (e.g. an entry trampoline
+	 * stack) we can erase everything below the pt_regs at the top of the
+	 * task stack.
+	 *
+	 * If we're running on the task stack itself, we must not clobber any
+	 * stack used by this function and its caller. We assume that this
+	 * function has a fixed-size stack frame, and the current stack pointer
+	 * doesn't change while we write poison.
 	 */
 	if (on_thread_stack())
 		erase_high = current_stack_pointer;
 	else
-		erase_high = current_top_of_stack();
+		erase_high = task_stack_high;
 
 	while (erase_low < erase_high) {
 		*(unsigned long *)erase_low = STACKLEAK_POISON;
@@ -108,7 +117,7 @@ static __always_inline void __stackleak_erase(void)
 	}
 
 	/* Reset the 'lowest_stack' value for the next syscall */
-	current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
+	current->lowest_stack = task_stack_high;
 }
 
 asmlinkage void noinstr stackleak_erase(void)
-- 
cgit v1.2.3


From 77cf2b6dee6680536a3109d894f1b1ccda3fc5bf Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:22 +0100
Subject: stackleak: rework poison scanning

Currently we over-estimate the region of stack which must be erased.

To determine the region to be erased, we scan downwards for a contiguous
block of poison values (or the low bound of the stack). There are a few
minor problems with this today:

* When we find a block of poison values, we include this block within
  the region to erase.

  As this is included within the region to erase, this causes us to
  redundantly overwrite 'STACKLEAK_SEARCH_DEPTH' (128) bytes with
  poison.

* As the loop condition checks 'poison_count <= depth', it will run an
  additional iteration after finding the contiguous block of poison,
  decrementing 'erase_low' once more than necessary.

  As this is included within the region to erase, this causes us to
  redundantly overwrite an additional unsigned long with poison.

* As we always decrement 'erase_low' after checking an element on the
  stack, we always include the element below this within the region to
  erase.

  As this is included within the region to erase, this causes us to
  redundantly overwrite an additional unsigned long with poison.

  Note that this is not a functional problem. As the loop condition
  checks 'erase_low > task_stack_low', we'll never clobber the
  STACK_END_MAGIC. As we always decrement 'erase_low' after this, we'll
  never fail to erase the element immediately above the STACK_END_MAGIC.

In total, this can cause us to erase `128 + 2 * sizeof(unsigned long)`
bytes more than necessary, which is unfortunate.

This patch reworks the logic to find the address immediately above the
poisoned region, by finding the lowest non-poisoned address. This is
factored into a stackleak_find_top_of_poison() helper both for clarity
and so that this can be shared with the LKDTM test in subsequent
patches.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-8-mark.rutland@arm.com
---
 kernel/stackleak.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index ba346d46218f..afd54b8e10b8 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -74,20 +74,10 @@ static __always_inline void __stackleak_erase(void)
 {
 	const unsigned long task_stack_low = stackleak_task_low_bound(current);
 	const unsigned long task_stack_high = stackleak_task_high_bound(current);
-	unsigned long erase_low = current->lowest_stack;
-	unsigned long erase_high;
-	unsigned int poison_count = 0;
-	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
-
-	/* Search for the poison value in the kernel stack */
-	while (erase_low > task_stack_low && poison_count <= depth) {
-		if (*(unsigned long *)erase_low == STACKLEAK_POISON)
-			poison_count++;
-		else
-			poison_count = 0;
-
-		erase_low -= sizeof(unsigned long);
-	}
+	unsigned long erase_low, erase_high;
+
+	erase_low = stackleak_find_top_of_poison(task_stack_low,
+						 current->lowest_stack);
 
 #ifdef CONFIG_STACKLEAK_METRICS
 	current->prev_lowest_stack = erase_low;
-- 
cgit v1.2.3


From 8111e67dee9ff774712cff8e34fba465c8361960 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 27 Apr 2022 18:31:27 +0100
Subject: stackleak: add on/off stack variants

The stackleak_erase() code dynamically handles being on a task stack or
another stack. In most cases, this is a fixed property of the caller,
which the caller is aware of, as an architecture might always return
using the task stack, or might always return using a trampoline stack.

This patch adds stackleak_erase_on_task_stack() and
stackleak_erase_off_task_stack() functions which callers can use to
avoid on_thread_stack() check and associated redundant work when the
calling stack is known. The existing stackleak_erase() is retained as a
safe default.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220427173128.2603085-13-mark.rutland@arm.com
---
 kernel/stackleak.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index afd54b8e10b8..c2c33d2202e9 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init);
 #define skip_erasing()	false
 #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
 
-static __always_inline void __stackleak_erase(void)
+static __always_inline void __stackleak_erase(bool on_task_stack)
 {
 	const unsigned long task_stack_low = stackleak_task_low_bound(current);
 	const unsigned long task_stack_high = stackleak_task_high_bound(current);
@@ -96,7 +96,7 @@ static __always_inline void __stackleak_erase(void)
 	 * function has a fixed-size stack frame, and the current stack pointer
 	 * doesn't change while we write poison.
 	 */
-	if (on_thread_stack())
+	if (on_task_stack)
 		erase_high = current_stack_pointer;
 	else
 		erase_high = task_stack_high;
@@ -110,12 +110,41 @@ static __always_inline void __stackleak_erase(void)
 	current->lowest_stack = task_stack_high;
 }
 
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
 asmlinkage void noinstr stackleak_erase(void)
 {
 	if (skip_erasing())
 		return;
 
-	__stackleak_erase();
+	__stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+	if (skip_erasing())
+		return;
+
+	__stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+	if (skip_erasing())
+		return;
+
+	__stackleak_erase(false);
 }
 
 void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
-- 
cgit v1.2.3


From 6d97af487dee3176cb1342d4ab16637e495440ad Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Wed, 4 May 2022 08:23:50 +0200
Subject: entry: Rename arch_check_user_regs() to arch_enter_from_user_mode()

arch_check_user_regs() is used at the moment to verify that struct pt_regs
contains valid values when entering the kernel from userspace. s390 needs
a place in the generic entry code to modify a cpu data structure when
switching from userspace to kernel mode. As arch_check_user_regs() is
exactly this, rename it to arch_enter_from_user_mode().

When entering the kernel from userspace, arch_check_user_regs() is
used to verify that struct pt_regs contains valid values. Note that
the NMI codepath doesn't call this function. s390 needs a place in the
generic entry code to modify a cpu data structure when switching from
userspace to kernel mode. As arch_check_user_regs() is exactly this,
rename it to arch_enter_from_user_mode().

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Link: https://lore.kernel.org/r/20220504062351.2954280-2-tmricht@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 kernel/entry/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 93c3b86e781c..9e63923c5a0f 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -17,7 +17,7 @@
 /* See comment for enter_from_user_mode() in entry-common.h */
 static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
 {
-	arch_check_user_regs(regs);
+	arch_enter_from_user_mode(regs);
 	lockdep_hardirqs_off(CALLER_ADDR0);
 
 	CT_WARN_ON(ct_state() != CONTEXT_USER);
-- 
cgit v1.2.3


From 5efe7448a1426250b5747c10ad438517f44f1e51 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Apr 2022 08:43:23 -0400
Subject: fs: Introduce aops->read_folio

Change all the callers of ->readpage to call ->read_folio in preference,
if it exists.  This is a transitional duplication, and will be removed
by the end of the series.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 kernel/events/uprobes.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6418083901d4..2c7815d20038 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -790,7 +790,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
 	 * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
 	 * see uprobe_register().
 	 */
-	if (mapping->a_ops->readpage)
+	if (mapping->a_ops->read_folio || mapping->a_ops->readpage)
 		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
 	else
 		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -1143,7 +1143,9 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
 		return -EINVAL;
 
 	/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
-	if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+	if (!inode->i_mapping->a_ops->read_folio &&
+	    !inode->i_mapping->a_ops->readpage &&
+	    !shmem_mapping(inode->i_mapping))
 		return -EIO;
 	/* Racy, just to catch the obvious mistakes */
 	if (offset > i_size_read(inode))
-- 
cgit v1.2.3


From 7e0a126519b82648b254afcd95a168c15f65ea40 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Apr 2022 11:53:28 -0400
Subject: mm,fs: Remove aops->readpage

With all implementations of aops->readpage converted to aops->read_folio,
we can stop checking whether it's set and remove the member from aops.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 kernel/events/uprobes.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2c7815d20038..a9bc3c98f76a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -787,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
 	struct page *page;
 	/*
 	 * Ensure that the page that has the original instruction is populated
-	 * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+	 * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
 	 * see uprobe_register().
 	 */
-	if (mapping->a_ops->read_folio || mapping->a_ops->readpage)
+	if (mapping->a_ops->read_folio)
 		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
 	else
 		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
@@ -1144,7 +1144,6 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
 
 	/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
 	if (!inode->i_mapping->a_ops->read_folio &&
-	    !inode->i_mapping->a_ops->readpage &&
 	    !shmem_mapping(inode->i_mapping))
 		return -EIO;
 	/* Racy, just to catch the obvious mistakes */
-- 
cgit v1.2.3


From 764aaf44cd64dd1f760268ee0b22d2dc53cd5bc0 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 27 Apr 2022 20:54:01 +0800
Subject: reboot: Fix build warning without CONFIG_SYSCTL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_SYSCTL is n, build warn:

kernel/reboot.c:443:20: error: ‘kernel_reboot_sysctls_init’ defined but not used [-Werror=unused-function]
 static void __init kernel_reboot_sysctls_init(void)
                    ^~~~~~~~~~~~~~~~~~~~~~~~~~

Move kernel_reboot_sysctls_init() to #ifdef block to fix this.

Fixes: 06d177662fb8 ("kernel/reboot: move reboot sysctls to its own file")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/reboot.c | 54 +++++++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index ed4e6dfb7d44..ecbf09ea03c5 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -421,33 +421,6 @@ void ctrl_alt_del(void)
 static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
 static const char reboot_cmd[] = "/sbin/reboot";
 
-#ifdef CONFIG_SYSCTL
-static struct ctl_table kern_reboot_table[] = {
-	{
-		.procname       = "poweroff_cmd",
-		.data           = &poweroff_cmd,
-		.maxlen         = POWEROFF_CMD_PATH_LEN,
-		.mode           = 0644,
-		.proc_handler   = proc_dostring,
-	},
-	{
-		.procname       = "ctrl-alt-del",
-		.data           = &C_A_D,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec,
-	},
-	{ }
-};
-
-static void __init kernel_reboot_sysctls_init(void)
-{
-	register_sysctl_init("kernel", kern_reboot_table);
-}
-#else
-#define kernel_reboot_sysctls_init() do { } while (0)
-#endif /* CONFIG_SYSCTL */
-
 static int run_cmd(const char *cmd)
 {
 	char **argv;
@@ -895,6 +868,33 @@ static struct attribute *reboot_attrs[] = {
 	NULL,
 };
 
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_reboot_table[] = {
+	{
+		.procname       = "poweroff_cmd",
+		.data           = &poweroff_cmd,
+		.maxlen         = POWEROFF_CMD_PATH_LEN,
+		.mode           = 0644,
+		.proc_handler   = proc_dostring,
+	},
+	{
+		.procname       = "ctrl-alt-del",
+		.data           = &C_A_D,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }
+};
+
+static void __init kernel_reboot_sysctls_init(void)
+{
+	register_sysctl_init("kernel", kern_reboot_table);
+}
+#else
+#define kernel_reboot_sysctls_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
 static const struct attribute_group reboot_attr_group = {
 	.attrs = reboot_attrs,
 };
-- 
cgit v1.2.3


From 494dcdf46e5cdee926c9f441d37e3ea1db57d1da Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 27 Apr 2022 21:10:02 +0800
Subject: sched: Fix build warning without CONFIG_SYSCTL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

IF CONFIG_SYSCTL is n, build warn:

kernel/sched/core.c:1782:12: warning: ‘sysctl_sched_uclamp_handler’ defined but not used [-Wunused-function]
 static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~

sysctl_sched_uclamp_handler() is used while CONFIG_SYSCTL enabled,
wrap all related code with CONFIG_SYSCTL to fix this.

Fixes: 3267e0156c33 ("sched: Move uclamp_util sysctls to core.c")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/sched/core.c | 65 +++++++++++++++++++++++++++++------------------------
 1 file changed, 36 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef31751c5799..b8d2ff09b853 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1306,10 +1306,10 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 static DEFINE_MUTEX(uclamp_mutex);
 
 /* Max allowed minimum utilization */
-static unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 
 /* Max allowed maximum utilization */
-static unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
 /*
  * By default RT tasks run at the maximum performance point/capacity of the
@@ -1456,33 +1456,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p)
 	task_rq_unlock(rq, p, &rf);
 }
 
-static void uclamp_sync_util_min_rt_default(void)
-{
-	struct task_struct *g, *p;
-
-	/*
-	 * copy_process()			sysctl_uclamp
-	 *					  uclamp_min_rt = X;
-	 *   write_lock(&tasklist_lock)		  read_lock(&tasklist_lock)
-	 *   // link thread			  smp_mb__after_spinlock()
-	 *   write_unlock(&tasklist_lock)	  read_unlock(&tasklist_lock);
-	 *   sched_post_fork()			  for_each_process_thread()
-	 *     __uclamp_sync_rt()		    __uclamp_sync_rt()
-	 *
-	 * Ensures that either sched_post_fork() will observe the new
-	 * uclamp_min_rt or for_each_process_thread() will observe the new
-	 * task.
-	 */
-	read_lock(&tasklist_lock);
-	smp_mb__after_spinlock();
-	read_unlock(&tasklist_lock);
-
-	rcu_read_lock();
-	for_each_process_thread(g, p)
-		uclamp_update_util_min_rt_default(p);
-	rcu_read_unlock();
-}
-
 static inline struct uclamp_se
 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
@@ -1762,6 +1735,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css)
 }
 
 static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_UCLAMP_TASK_GROUP
 static void uclamp_update_root_tg(void)
 {
 	struct task_group *tg = &root_task_group;
@@ -1779,6 +1757,33 @@ static void uclamp_update_root_tg(void)
 static void uclamp_update_root_tg(void) { }
 #endif
 
+static void uclamp_sync_util_min_rt_default(void)
+{
+	struct task_struct *g, *p;
+
+	/*
+	 * copy_process()			sysctl_uclamp
+	 *					  uclamp_min_rt = X;
+	 *   write_lock(&tasklist_lock)		  read_lock(&tasklist_lock)
+	 *   // link thread			  smp_mb__after_spinlock()
+	 *   write_unlock(&tasklist_lock)	  read_unlock(&tasklist_lock);
+	 *   sched_post_fork()			  for_each_process_thread()
+	 *     __uclamp_sync_rt()		    __uclamp_sync_rt()
+	 *
+	 * Ensures that either sched_post_fork() will observe the new
+	 * uclamp_min_rt or for_each_process_thread() will observe the new
+	 * task.
+	 */
+	read_lock(&tasklist_lock);
+	smp_mb__after_spinlock();
+	read_unlock(&tasklist_lock);
+
+	rcu_read_lock();
+	for_each_process_thread(g, p)
+		uclamp_update_util_min_rt_default(p);
+	rcu_read_unlock();
+}
+
 static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				void *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -1843,6 +1848,8 @@ done:
 
 	return result;
 }
+#endif
+#endif
 
 static int uclamp_validate(struct task_struct *p,
 			   const struct sched_attr *attr)
-- 
cgit v1.2.3


From 43bf087848ab796fab93c9b4de59a7ed70aab94a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Thu, 5 May 2022 15:01:14 +0800
Subject: bpf: Remove unused parameter from find_kfunc_desc_btf()

The func_id parameter in find_kfunc_desc_btf() is not used, get rid of it.

Fixes: 2357672c54c3 ("bpf: Introduce BPF support for kernel module function calls")
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20220505070114.3522522-1-ytcoode@gmail.com
---
 kernel/bpf/verifier.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 813f6ee80419..c27fee73a2cb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1815,8 +1815,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
 	kfree(tab);
 }
 
-static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
-				       u32 func_id, s16 offset)
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
 {
 	if (offset) {
 		if (offset < 0) {
@@ -1891,7 +1890,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 		prog_aux->kfunc_btf_tab = btf_tab;
 	}
 
-	desc_btf = find_kfunc_desc_btf(env, func_id, offset);
+	desc_btf = find_kfunc_desc_btf(env, offset);
 	if (IS_ERR(desc_btf)) {
 		verbose(env, "failed to find BTF for kernel function\n");
 		return PTR_ERR(desc_btf);
@@ -2360,7 +2359,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
 	if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
 		return NULL;
 
-	desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
+	desc_btf = find_kfunc_desc_btf(data, insn->off);
 	if (IS_ERR(desc_btf))
 		return "<error>";
 
@@ -7237,7 +7236,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (!insn->imm)
 		return 0;
 
-	desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
+	desc_btf = find_kfunc_desc_btf(env, insn->off);
 	if (IS_ERR(desc_btf))
 		return PTR_ERR(desc_btf);
 
-- 
cgit v1.2.3


From 40f2bbf71161fa9195c7869004290003af152375 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 9 May 2022 18:20:43 -0700
Subject: mm/rmap: drop "compound" parameter from page_add_new_anon_rmap()

New anonymous pages are always mapped natively: only THP/khugepaged code
maps a new compound anonymous page and passes "true".  Otherwise, we're
just dealing with simple, non-compound pages.

Let's give the interface clearer semantics and document these.  Remove the
PageTransCompound() sanity check from page_add_new_anon_rmap().

Link: https://lkml.kernel.org/r/20220428083441.37290-9-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Liang Zhang <zhangliang5@huawei.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Oded Gabbay <oded.gabbay@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/events/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6418083901d4..4ef5385815d3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	if (new_page) {
 		get_page(new_page);
-		page_add_new_anon_rmap(new_page, vma, addr, false);
+		page_add_new_anon_rmap(new_page, vma, addr);
 		lru_cache_add_inactive_or_unevictable(new_page, vma);
 	} else
 		/* no new page, just dec_mm_counter for old_page */
-- 
cgit v1.2.3


From 9263dddc7b6f816fdd327eee435cc54ba51dd095 Mon Sep 17 00:00:00 2001
From: Takshak Chahande <ctakshak@fb.com>
Date: Tue, 10 May 2022 01:22:20 -0700
Subject: bpf: Extend batch operations for map-in-map bpf-maps

This patch extends batch operations support for map-in-map map-types:
BPF_MAP_TYPE_HASH_OF_MAPS and BPF_MAP_TYPE_ARRAY_OF_MAPS

A usecase where outer HASH map holds hundred of VIP entries and its
associated reuse-ports per VIP stored in REUSEPORT_SOCKARRAY type
inner map, needs to do batch operation for performance gain.

This patch leverages the exiting generic functions for most of the batch
operations. As map-in-map's value contains the actual reference of the inner map,
for BPF_MAP_TYPE_HASH_OF_MAPS type, it needed an extra step to fetch the
map_id from the reference value.

selftests are added in next patch 2/2.

Signed-off-by: Takshak Chahande <ctakshak@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220510082221.2390540-1-ctakshak@fb.com
---
 kernel/bpf/arraymap.c |  2 ++
 kernel/bpf/hashtab.c  | 13 +++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b3bf31fd9458..724613da6576 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1345,6 +1345,8 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = array_of_map_gen_lookup,
+	.map_lookup_batch = generic_map_lookup_batch,
+	.map_update_batch = generic_map_update_batch,
 	.map_check_btf = map_check_no_btf,
 	.map_btf_id = &array_map_btf_ids[0],
 };
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3e00e62b2218..705841279d16 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -140,7 +140,7 @@ static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
 
 static void htab_init_buckets(struct bpf_htab *htab)
 {
-	unsigned i;
+	unsigned int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
@@ -1627,7 +1627,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
 	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
-	u32 batch, max_count, size, bucket_size;
+	u32 batch, max_count, size, bucket_size, map_id;
 	struct htab_elem *node_to_free = NULL;
 	u64 elem_map_flags, map_flags;
 	struct hlist_nulls_head *head;
@@ -1752,6 +1752,14 @@ again_nocopy:
 			}
 		} else {
 			value = l->key + roundup_key_size;
+			if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+				struct bpf_map **inner_map = value;
+
+				 /* Actual value is the id of the inner map */
+				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+				value = &map_id;
+			}
+
 			if (elem_map_flags & BPF_F_LOCK)
 				copy_map_value_locked(map, dst_val, value,
 						      true);
@@ -2450,5 +2458,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 	.map_gen_lookup = htab_of_map_gen_lookup,
 	.map_check_btf = map_check_no_btf,
+	BATCH_OPS(htab),
 	.map_btf_id = &htab_map_btf_ids[0],
 };
-- 
cgit v1.2.3


From 9f88361273082825d9f0d13a543d49f9fa0d44a8 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Tue, 10 May 2022 17:52:30 +0200
Subject: bpf: Add bpf_link iterator

Implement bpf_link iterator to traverse links via bpf_seq_file
operations. The changeset is mostly shamelessly copied from
commit a228a64fc1e4 ("bpf: Add bpf_prog iterator")

Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220510155233.9815-2-9erthalion6@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/Makefile    |   2 +-
 kernel/bpf/link_iter.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c   |  19 +++++++++
 3 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/link_iter.c

(limited to 'kernel')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index c1a9be6a4b9f..057ba8e01e70 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
new file mode 100644
index 000000000000..fec8005a121c
--- /dev/null
+++ b/kernel/bpf/link_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Red Hat, Inc. */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_link_info {
+	u32 link_id;
+};
+
+static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_link_info *info = seq->private;
+	struct bpf_link *link;
+
+	link = bpf_link_get_curr_or_next(&info->link_id);
+	if (!link)
+		return NULL;
+
+	if (*pos == 0)
+		++*pos;
+	return link;
+}
+
+static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_link_info *info = seq->private;
+
+	++*pos;
+	++info->link_id;
+	bpf_link_put((struct bpf_link *)v);
+	return bpf_link_get_curr_or_next(&info->link_id);
+}
+
+struct bpf_iter__bpf_link {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_link *, link);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link)
+
+static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+	struct bpf_iter__bpf_link ctx;
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ctx.meta = &meta;
+	ctx.link = v;
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, in_stop);
+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret;
+}
+
+static int bpf_link_seq_show(struct seq_file *seq, void *v)
+{
+	return __bpf_link_seq_show(seq, v, false);
+}
+
+static void bpf_link_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__bpf_link_seq_show(seq, v, true);
+	else
+		bpf_link_put((struct bpf_link *)v);
+}
+
+static const struct seq_operations bpf_link_seq_ops = {
+	.start	= bpf_link_seq_start,
+	.next	= bpf_link_seq_next,
+	.stop	= bpf_link_seq_stop,
+	.show	= bpf_link_seq_show,
+};
+
+BTF_ID_LIST(btf_bpf_link_id)
+BTF_ID(struct, bpf_link)
+
+static const struct bpf_iter_seq_info bpf_link_seq_info = {
+	.seq_ops		= &bpf_link_seq_ops,
+	.init_seq_private	= NULL,
+	.fini_seq_private	= NULL,
+	.seq_priv_size		= sizeof(struct bpf_iter_seq_link_info),
+};
+
+static struct bpf_iter_reg bpf_link_reg_info = {
+	.target			= "bpf_link",
+	.ctx_arg_info_size	= 1,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__bpf_link, link),
+		  PTR_TO_BTF_ID_OR_NULL },
+	},
+	.seq_info		= &bpf_link_seq_info,
+};
+
+static int __init bpf_link_iter_init(void)
+{
+	bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id;
+	return bpf_iter_reg_target(&bpf_link_reg_info);
+}
+
+late_initcall(bpf_link_iter_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e0aead17dff4..50164d324eaf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4680,6 +4680,25 @@ struct bpf_link *bpf_link_by_id(u32 id)
 	return link;
 }
 
+struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
+{
+	struct bpf_link *link;
+
+	spin_lock_bh(&link_idr_lock);
+again:
+	link = idr_get_next(&link_idr, id);
+	if (link) {
+		link = bpf_link_inc_not_zero(link);
+		if (IS_ERR(link)) {
+			(*id)++;
+			goto again;
+		}
+	}
+	spin_unlock_bh(&link_idr_lock);
+
+	return link;
+}
+
 #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
 
 static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
-- 
cgit v1.2.3


From d721def7392a7348ffb9f3583b264239cbd3702c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 10 May 2022 14:26:12 +0200
Subject: kallsyms: Make kallsyms_on_each_symbol generally available

Making kallsyms_on_each_symbol generally available, so it can be
used outside CONFIG_LIVEPATCH option in following changes.

Rather than adding another ifdef option let's make the function
generally available (when CONFIG_KALLSYMS option is defined).

Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220510122616.2652285-2-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/kallsyms.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 79f2eb617a62..fdfd308bebc4 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -228,7 +228,6 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
-#ifdef CONFIG_LIVEPATCH
 /*
  * Iterate over all symbols in vmlinux.  For symbols from modules use
  * module_kallsyms_on_each_symbol instead.
@@ -251,7 +250,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
 	}
 	return 0;
 }
-#endif /* CONFIG_LIVEPATCH */
 
 static unsigned long get_symbol_pos(unsigned long addr,
 				    unsigned long *symbolsize,
-- 
cgit v1.2.3


From bed0d9a50dacee6fcf785c555cfb0d2573355afc Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 10 May 2022 14:26:13 +0200
Subject: ftrace: Add ftrace_lookup_symbols function

Adding ftrace_lookup_symbols function that resolves array of symbols
with single pass over kallsyms.

The user provides array of string pointers with count and pointer to
allocated array for resolved values.

  int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt,
                            unsigned long *addrs)

It iterates all kallsyms symbols and tries to loop up each in provided
symbols array with bsearch. The symbols array needs to be sorted by
name for this reason.

We also check each symbol to pass ftrace_location, because this API
will be used for fprobe symbols resolving.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220510122616.2652285-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/kallsyms.c     |  1 +
 kernel/trace/ftrace.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fdfd308bebc4..fbdf8d3279ac 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -29,6 +29,7 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/bsearch.h>
 
 /*
  * These will be re-linked against their real values
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f1d2f5e7263..07d87c7a525d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7964,3 +7964,65 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	mutex_unlock(&ftrace_lock);
 	return ret;
 }
+
+static int symbols_cmp(const void *a, const void *b)
+{
+	const char **str_a = (const char **) a;
+	const char **str_b = (const char **) b;
+
+	return strcmp(*str_a, *str_b);
+}
+
+struct kallsyms_data {
+	unsigned long *addrs;
+	const char **syms;
+	size_t cnt;
+	size_t found;
+};
+
+static int kallsyms_callback(void *data, const char *name,
+			     struct module *mod, unsigned long addr)
+{
+	struct kallsyms_data *args = data;
+
+	if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp))
+		return 0;
+
+	addr = ftrace_location(addr);
+	if (!addr)
+		return 0;
+
+	args->addrs[args->found++] = addr;
+	return args->found == args->cnt ? 1 : 0;
+}
+
+/**
+ * ftrace_lookup_symbols - Lookup addresses for array of symbols
+ *
+ * @sorted_syms: array of symbols pointers symbols to resolve,
+ * must be alphabetically sorted
+ * @cnt: number of symbols/addresses in @syms/@addrs arrays
+ * @addrs: array for storing resulting addresses
+ *
+ * This function looks up addresses for array of symbols provided in
+ * @syms array (must be alphabetically sorted) and stores them in
+ * @addrs array, which needs to be big enough to store at least @cnt
+ * addresses.
+ *
+ * This function returns 0 if all provided symbols are found,
+ * -ESRCH otherwise.
+ */
+int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
+{
+	struct kallsyms_data args;
+	int err;
+
+	args.addrs = addrs;
+	args.syms = sorted_syms;
+	args.cnt = cnt;
+	args.found = 0;
+	err = kallsyms_on_each_symbol(kallsyms_callback, &args);
+	if (err < 0)
+		return err;
+	return args.found == args.cnt ? 0 : -ESRCH;
+}
-- 
cgit v1.2.3


From 8be9253344a1d8cd91b22655e55de41e3288aaf9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 10 May 2022 14:26:14 +0200
Subject: fprobe: Resolve symbols with ftrace_lookup_symbols

Using ftrace_lookup_symbols to speed up symbols lookup
in register_fprobe_syms API.

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220510122616.2652285-4-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/fprobe.c | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 89d9f994ebb0..aac63ca9c3d1 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -85,39 +85,31 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
 }
 NOKPROBE_SYMBOL(fprobe_exit_handler);
 
+static int symbols_cmp(const void *a, const void *b)
+{
+	const char **str_a = (const char **) a;
+	const char **str_b = (const char **) b;
+
+	return strcmp(*str_a, *str_b);
+}
+
 /* Convert ftrace location address from symbols */
 static unsigned long *get_ftrace_locations(const char **syms, int num)
 {
-	unsigned long addr, size;
 	unsigned long *addrs;
-	int i;
 
 	/* Convert symbols to symbol address */
 	addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
 	if (!addrs)
 		return ERR_PTR(-ENOMEM);
 
-	for (i = 0; i < num; i++) {
-		addr = kallsyms_lookup_name(syms[i]);
-		if (!addr)	/* Maybe wrong symbol */
-			goto error;
-
-		/* Convert symbol address to ftrace location. */
-		if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size)
-			goto error;
+	/* ftrace_lookup_symbols expects sorted symbols */
+	sort(syms, num, sizeof(*syms), symbols_cmp, NULL);
 
-		addr = ftrace_location_range(addr, addr + size - 1);
-		if (!addr) /* No dynamic ftrace there. */
-			goto error;
+	if (!ftrace_lookup_symbols(syms, num, addrs))
+		return addrs;
 
-		addrs[i] = addr;
-	}
-
-	return addrs;
-
-error:
 	kfree(addrs);
-
 	return ERR_PTR(-ENOENT);
 }
 
-- 
cgit v1.2.3


From 0236fec57a15dc2a068dfe4488e0c2ab4559b1ec Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 10 May 2022 14:26:15 +0200
Subject: bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link

Using kallsyms_lookup_names function to speed up symbols lookup in
kprobe multi link attachment and replacing with it the current
kprobe_multi_resolve_syms function.

This speeds up bpftrace kprobe attachment:

  # perf stat -r 5 -e cycles ./src/bpftrace -e 'kprobe:x* {  } i:ms:1 { exit(); }'
  ...
  6.5681 +- 0.0225 seconds time elapsed  ( +-  0.34% )

After:

  # perf stat -r 5 -e cycles ./src/bpftrace -e 'kprobe:x* {  } i:ms:1 { exit(); }'
  ...
  0.5661 +- 0.0275 seconds time elapsed  ( +-  4.85% )

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220510122616.2652285-5-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 112 ++++++++++++++++++++++++++++-------------------
 1 file changed, 66 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f15b826f9899..7fd11c17558d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2229,6 +2229,59 @@ struct bpf_kprobe_multi_run_ctx {
 	unsigned long entry_ip;
 };
 
+struct user_syms {
+	const char **syms;
+	char *buf;
+};
+
+static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
+{
+	unsigned long __user usymbol;
+	const char **syms = NULL;
+	char *buf = NULL, *p;
+	int err = -ENOMEM;
+	unsigned int i;
+
+	syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL);
+	if (!syms)
+		goto error;
+
+	buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL);
+	if (!buf)
+		goto error;
+
+	for (p = buf, i = 0; i < cnt; i++) {
+		if (__get_user(usymbol, usyms + i)) {
+			err = -EFAULT;
+			goto error;
+		}
+		err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN);
+		if (err == KSYM_NAME_LEN)
+			err = -E2BIG;
+		if (err < 0)
+			goto error;
+		syms[i] = p;
+		p += err + 1;
+	}
+
+	us->syms = syms;
+	us->buf = buf;
+	return 0;
+
+error:
+	if (err) {
+		kvfree(syms);
+		kvfree(buf);
+	}
+	return err;
+}
+
+static void free_user_syms(struct user_syms *us)
+{
+	kvfree(us->syms);
+	kvfree(us->buf);
+}
+
 static void bpf_kprobe_multi_link_release(struct bpf_link *link)
 {
 	struct bpf_kprobe_multi_link *kmulti_link;
@@ -2349,53 +2402,12 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
 	kprobe_multi_link_prog_run(link, entry_ip, regs);
 }
 
-static int
-kprobe_multi_resolve_syms(const void __user *usyms, u32 cnt,
-			  unsigned long *addrs)
+static int symbols_cmp(const void *a, const void *b)
 {
-	unsigned long addr, size;
-	const char __user **syms;
-	int err = -ENOMEM;
-	unsigned int i;
-	char *func;
-
-	size = cnt * sizeof(*syms);
-	syms = kvzalloc(size, GFP_KERNEL);
-	if (!syms)
-		return -ENOMEM;
+	const char **str_a = (const char **) a;
+	const char **str_b = (const char **) b;
 
-	func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
-	if (!func)
-		goto error;
-
-	if (copy_from_user(syms, usyms, size)) {
-		err = -EFAULT;
-		goto error;
-	}
-
-	for (i = 0; i < cnt; i++) {
-		err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
-		if (err == KSYM_NAME_LEN)
-			err = -E2BIG;
-		if (err < 0)
-			goto error;
-		err = -EINVAL;
-		addr = kallsyms_lookup_name(func);
-		if (!addr)
-			goto error;
-		if (!kallsyms_lookup_size_offset(addr, &size, NULL))
-			goto error;
-		addr = ftrace_location_range(addr, addr + size - 1);
-		if (!addr)
-			goto error;
-		addrs[i] = addr;
-	}
-
-	err = 0;
-error:
-	kvfree(syms);
-	kfree(func);
-	return err;
+	return strcmp(*str_a, *str_b);
 }
 
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -2441,7 +2453,15 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 			goto error;
 		}
 	} else {
-		err = kprobe_multi_resolve_syms(usyms, cnt, addrs);
+		struct user_syms us;
+
+		err = copy_user_syms(&us, usyms, cnt);
+		if (err)
+			goto error;
+
+		sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL);
+		err = ftrace_lookup_symbols(us.syms, cnt, addrs);
+		free_user_syms(&us);
 		if (err)
 			goto error;
 	}
-- 
cgit v1.2.3


From 792ea6a074ae7ea5ab6f1b8b31f76bb0297de66c Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Tue, 10 May 2022 09:56:05 +0200
Subject: genirq: Remove WARN_ON_ONCE() in generic_handle_domain_irq()

Since commit 0953fb263714 ("irq: remove handle_domain_{irq,nmi}()"),
generic_handle_domain_irq() warns if called outside hardirq context, even
though the function calls down to handle_irq_desc(), which warns about the
same, but conditionally on handle_enforce_irqctx().

The newly added warning is a false positive if the interrupt originates
from any other irqchip than x86 APIC or ARM GIC/GICv3.  Those are the only
ones for which handle_enforce_irqctx() returns true.  Per commit
c16816acd086 ("genirq: Add protection against unsafe usage of
generic_handle_irq()"):

 "In general calling generic_handle_irq() with interrupts disabled from non
  interrupt context is harmless. For some interrupt controllers like the
  x86 trainwrecks this is outright dangerous as it might corrupt state if
  an interrupt affinity change is pending."

Examples for interrupt chips where the warning is a false positive are
USB-attached GPIO controllers such as drivers/gpio/gpio-dln2.c:

  USB gadgets are incapable of directly signaling an interrupt because they
  cannot initiate a bus transaction by themselves.  All communication on
  the bus is initiated by the host controller, which polls a gadget's
  Interrupt Endpoint in regular intervals.  If an interrupt is pending,
  that information is passed up the stack in softirq context, from which a
  hardirq is synthesized via generic_handle_domain_irq().

Remove the warning to eliminate such false positives.

Fixes: 0953fb263714 ("irq: remove handle_domain_{irq,nmi}()")
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Jakub Kicinski <kuba@kernel.org>
CC: Linus Walleij <linus.walleij@linaro.org>
Cc: Bartosz Golaszewski <brgl@bgdev.pl>
Cc: Octavian Purdila <octavian.purdila@nxp.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20220505113207.487861b2@kernel.org
Link: https://lore.kernel.org/r/20220506203242.GA1855@wunner.de
Link: https://lore.kernel.org/r/c3caf60bfa78e5fdbdf483096b7174da65d1813a.1652168866.git.lukas@wunner.de
---
 kernel/irq/irqdesc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0099b87dd853..d323b180b0f3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -701,7 +701,6 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
  */
 int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
 {
-	WARN_ON_ONCE(!in_hardirq());
 	return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
 }
 EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
-- 
cgit v1.2.3


From f7e0beaf39d3868dc700d4954b26cf8443c5d423 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Tue, 10 May 2022 13:59:19 -0700
Subject: bpf, x86: Generate trampolines from bpf_tramp_links

Replace struct bpf_tramp_progs with struct bpf_tramp_links to collect
struct bpf_tramp_link(s) for a trampoline.  struct bpf_tramp_link
extends bpf_link to act as a linked list node.

arch_prepare_bpf_trampoline() accepts a struct bpf_tramp_links to
collects all bpf_tramp_link(s) that a trampoline should call.

Change BPF trampoline and bpf_struct_ops to pass bpf_tramp_links
instead of bpf_tramp_progs.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220510205923.3206889-2-kuifeng@fb.com
---
 kernel/bpf/bpf_struct_ops.c | 71 +++++++++++++++++++++++++++++--------------
 kernel/bpf/syscall.c        | 23 +++++---------
 kernel/bpf/trampoline.c     | 73 +++++++++++++++++++++++++--------------------
 3 files changed, 98 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 3a0103ad97bc..d9a3c9207240 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -33,15 +33,15 @@ struct bpf_struct_ops_map {
 	const struct bpf_struct_ops *st_ops;
 	/* protect map_update */
 	struct mutex lock;
-	/* progs has all the bpf_prog that is populated
+	/* link has all the bpf_links that is populated
 	 * to the func ptr of the kernel's struct
 	 * (in kvalue.data).
 	 */
-	struct bpf_prog **progs;
+	struct bpf_link **links;
 	/* image is a page that has all the trampolines
 	 * that stores the func args before calling the bpf_prog.
 	 * A PAGE_SIZE "image" is enough to store all trampoline for
-	 * "progs[]".
+	 * "links[]".
 	 */
 	void *image;
 	/* uvalue->data stores the kernel struct
@@ -283,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 	u32 i;
 
 	for (i = 0; i < btf_type_vlen(t); i++) {
-		if (st_map->progs[i]) {
-			bpf_prog_put(st_map->progs[i]);
-			st_map->progs[i] = NULL;
+		if (st_map->links[i]) {
+			bpf_link_put(st_map->links[i]);
+			st_map->links[i] = NULL;
 		}
 	}
 }
@@ -316,18 +316,34 @@ static int check_zero_holes(const struct btf_type *t, void *data)
 	return 0;
 }
 
-int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
-				      struct bpf_prog *prog,
+static void bpf_struct_ops_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
+
+	kfree(tlink);
+}
+
+const struct bpf_link_ops bpf_struct_ops_link_lops = {
+	.release = bpf_struct_ops_link_release,
+	.dealloc = bpf_struct_ops_link_dealloc,
+};
+
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
+				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
 				      void *image, void *image_end)
 {
 	u32 flags;
 
-	tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
-	tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
 	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
 	return arch_prepare_bpf_trampoline(NULL, image, image_end,
-					   model, flags, tprogs, NULL);
+					   model, flags, tlinks, NULL);
 }
 
 static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -338,7 +354,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	const struct btf_member *member;
 	const struct btf_type *t = st_ops->type;
-	struct bpf_tramp_progs *tprogs = NULL;
+	struct bpf_tramp_links *tlinks = NULL;
 	void *udata, *kdata;
 	int prog_fd, err = 0;
 	void *image, *image_end;
@@ -362,8 +378,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	if (uvalue->state || refcount_read(&uvalue->refcnt))
 		return -EINVAL;
 
-	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
-	if (!tprogs)
+	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+	if (!tlinks)
 		return -ENOMEM;
 
 	uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
@@ -386,6 +402,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	for_each_member(i, t, member) {
 		const struct btf_type *mtype, *ptype;
 		struct bpf_prog *prog;
+		struct bpf_tramp_link *link;
 		u32 moff;
 
 		moff = __btf_member_bit_offset(t, member) / 8;
@@ -439,16 +456,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			err = PTR_ERR(prog);
 			goto reset_unlock;
 		}
-		st_map->progs[i] = prog;
 
 		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
 		    prog->aux->attach_btf_id != st_ops->type_id ||
 		    prog->expected_attach_type != i) {
+			bpf_prog_put(prog);
 			err = -EINVAL;
 			goto reset_unlock;
 		}
 
-		err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
+		link = kzalloc(sizeof(*link), GFP_USER);
+		if (!link) {
+			bpf_prog_put(prog);
+			err = -ENOMEM;
+			goto reset_unlock;
+		}
+		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+			      &bpf_struct_ops_link_lops, prog);
+		st_map->links[i] = &link->link;
+
+		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 							&st_ops->func_models[i],
 							image, image_end);
 		if (err < 0)
@@ -491,7 +518,7 @@ reset_unlock:
 	memset(uvalue, 0, map->value_size);
 	memset(kvalue, 0, map->value_size);
 unlock:
-	kfree(tprogs);
+	kfree(tlinks);
 	mutex_unlock(&st_map->lock);
 	return err;
 }
@@ -546,9 +573,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 
-	if (st_map->progs)
+	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
-	bpf_map_area_free(st_map->progs);
+	bpf_map_area_free(st_map->links);
 	bpf_jit_free_exec(st_map->image);
 	bpf_map_area_free(st_map->uvalue);
 	bpf_map_area_free(st_map);
@@ -597,11 +624,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	map = &st_map->map;
 
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
-	st_map->progs =
-		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
+	st_map->links =
+		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
 				   NUMA_NO_NODE);
 	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
-	if (!st_map->uvalue || !st_map->progs || !st_map->image) {
+	if (!st_map->uvalue || !st_map->links || !st_map->image) {
 		bpf_struct_ops_map_free(map);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50164d324eaf..2dc582773344 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2864,19 +2864,12 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 }
 EXPORT_SYMBOL(bpf_link_get_from_fd);
 
-struct bpf_tracing_link {
-	struct bpf_link link;
-	enum bpf_attach_type attach_type;
-	struct bpf_trampoline *trampoline;
-	struct bpf_prog *tgt_prog;
-};
-
 static void bpf_tracing_link_release(struct bpf_link *link)
 {
 	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link);
+		container_of(link, struct bpf_tracing_link, link.link);
 
-	WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
 						tr_link->trampoline));
 
 	bpf_trampoline_put(tr_link->trampoline);
@@ -2889,7 +2882,7 @@ static void bpf_tracing_link_release(struct bpf_link *link)
 static void bpf_tracing_link_dealloc(struct bpf_link *link)
 {
 	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link);
+		container_of(link, struct bpf_tracing_link, link.link);
 
 	kfree(tr_link);
 }
@@ -2898,7 +2891,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
 					 struct seq_file *seq)
 {
 	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link);
+		container_of(link, struct bpf_tracing_link, link.link);
 
 	seq_printf(seq,
 		   "attach_type:\t%d\n",
@@ -2909,7 +2902,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
 					   struct bpf_link_info *info)
 {
 	struct bpf_tracing_link *tr_link =
-		container_of(link, struct bpf_tracing_link, link);
+		container_of(link, struct bpf_tracing_link, link.link);
 
 	info->tracing.attach_type = tr_link->attach_type;
 	bpf_trampoline_unpack_key(tr_link->trampoline->key,
@@ -2990,7 +2983,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		err = -ENOMEM;
 		goto out_put_prog;
 	}
-	bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
+	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
 		      &bpf_tracing_link_lops, prog);
 	link->attach_type = prog->expected_attach_type;
 
@@ -3060,11 +3053,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		tgt_prog = prog->aux->dst_prog;
 	}
 
-	err = bpf_link_prime(&link->link, &link_primer);
+	err = bpf_link_prime(&link->link.link, &link_primer);
 	if (err)
 		goto out_unlock;
 
-	err = bpf_trampoline_link_prog(prog, tr);
+	err = bpf_trampoline_link_prog(&link->link, tr);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		link = NULL;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ada97751ae1b..d5e6bc5517cb 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -168,30 +168,30 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 	return ret;
 }
 
-static struct bpf_tramp_progs *
+static struct bpf_tramp_links *
 bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
 {
-	const struct bpf_prog_aux *aux;
-	struct bpf_tramp_progs *tprogs;
-	struct bpf_prog **progs;
+	struct bpf_tramp_link *link;
+	struct bpf_tramp_links *tlinks;
+	struct bpf_tramp_link **links;
 	int kind;
 
 	*total = 0;
-	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
-	if (!tprogs)
+	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+	if (!tlinks)
 		return ERR_PTR(-ENOMEM);
 
 	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
-		tprogs[kind].nr_progs = tr->progs_cnt[kind];
+		tlinks[kind].nr_links = tr->progs_cnt[kind];
 		*total += tr->progs_cnt[kind];
-		progs = tprogs[kind].progs;
+		links = tlinks[kind].links;
 
-		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
-			*ip_arg |= aux->prog->call_get_func_ip;
-			*progs++ = aux->prog;
+		hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+			*ip_arg |= link->link.prog->call_get_func_ip;
+			*links++ = link;
 		}
 	}
-	return tprogs;
+	return tlinks;
 }
 
 static void __bpf_tramp_image_put_deferred(struct work_struct *work)
@@ -330,14 +330,14 @@ out:
 static int bpf_trampoline_update(struct bpf_trampoline *tr)
 {
 	struct bpf_tramp_image *im;
-	struct bpf_tramp_progs *tprogs;
+	struct bpf_tramp_links *tlinks;
 	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
 	bool ip_arg = false;
 	int err, total;
 
-	tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
-	if (IS_ERR(tprogs))
-		return PTR_ERR(tprogs);
+	tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+	if (IS_ERR(tlinks))
+		return PTR_ERR(tlinks);
 
 	if (total == 0) {
 		err = unregister_fentry(tr, tr->cur_image->image);
@@ -353,15 +353,15 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 		goto out;
 	}
 
-	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
-	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
+	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links)
 		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
 
 	if (ip_arg)
 		flags |= BPF_TRAMP_F_IP_ARG;
 
 	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
-					  &tr->func.model, flags, tprogs,
+					  &tr->func.model, flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
 		goto out;
@@ -381,7 +381,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	tr->cur_image = im;
 	tr->selector++;
 out:
-	kfree(tprogs);
+	kfree(tlinks);
 	return err;
 }
 
@@ -407,13 +407,14 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 	}
 }
 
-int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
 {
 	enum bpf_tramp_prog_type kind;
+	struct bpf_tramp_link *link_exiting;
 	int err = 0;
 	int cnt;
 
-	kind = bpf_attach_type_to_tramp(prog);
+	kind = bpf_attach_type_to_tramp(link->link.prog);
 	mutex_lock(&tr->mutex);
 	if (tr->extension_prog) {
 		/* cannot attach fentry/fexit if extension prog is attached.
@@ -429,25 +430,33 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
 			err = -EBUSY;
 			goto out;
 		}
-		tr->extension_prog = prog;
+		tr->extension_prog = link->link.prog;
 		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
-					 prog->bpf_func);
+					 link->link.prog->bpf_func);
 		goto out;
 	}
-	if (cnt >= BPF_MAX_TRAMP_PROGS) {
+	if (cnt >= BPF_MAX_TRAMP_LINKS) {
 		err = -E2BIG;
 		goto out;
 	}
-	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+	if (!hlist_unhashed(&link->tramp_hlist)) {
 		/* prog already linked */
 		err = -EBUSY;
 		goto out;
 	}
-	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+	hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+		if (link_exiting->link.prog != link->link.prog)
+			continue;
+		/* prog already linked */
+		err = -EBUSY;
+		goto out;
+	}
+
+	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
 	tr->progs_cnt[kind]++;
 	err = bpf_trampoline_update(tr);
 	if (err) {
-		hlist_del_init(&prog->aux->tramp_hlist);
+		hlist_del_init(&link->tramp_hlist);
 		tr->progs_cnt[kind]--;
 	}
 out:
@@ -456,12 +465,12 @@ out:
 }
 
 /* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
 {
 	enum bpf_tramp_prog_type kind;
 	int err;
 
-	kind = bpf_attach_type_to_tramp(prog);
+	kind = bpf_attach_type_to_tramp(link->link.prog);
 	mutex_lock(&tr->mutex);
 	if (kind == BPF_TRAMP_REPLACE) {
 		WARN_ON_ONCE(!tr->extension_prog);
@@ -470,7 +479,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
 		tr->extension_prog = NULL;
 		goto out;
 	}
-	hlist_del_init(&prog->aux->tramp_hlist);
+	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
 	err = bpf_trampoline_update(tr);
 out:
@@ -635,7 +644,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
 int __weak
 arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
-			    struct bpf_tramp_progs *tprogs,
+			    struct bpf_tramp_links *tlinks,
 			    void *orig_call)
 {
 	return -ENOTSUPP;
-- 
cgit v1.2.3


From e384c7b7b46d0a5f4bf3c554f963e6e9622d0ab1 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Tue, 10 May 2022 13:59:20 -0700
Subject: bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack

BPF trampolines will create a bpf_tramp_run_ctx, a bpf_run_ctx, on
stacks and set/reset the current bpf_run_ctx before/after calling a
bpf_prog.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220510205923.3206889-3-kuifeng@fb.com
---
 kernel/bpf/syscall.c    |  7 +++++--
 kernel/bpf/trampoline.c | 20 ++++++++++++++++----
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2dc582773344..d48165fccf49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5020,6 +5020,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
 BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
 {
 	struct bpf_prog * __maybe_unused prog;
+	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
 
 	switch (cmd) {
 	case BPF_MAP_CREATE:
@@ -5047,13 +5048,15 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
 			return -EINVAL;
 		}
 
-		if (!__bpf_prog_enter_sleepable(prog)) {
+		run_ctx.bpf_cookie = 0;
+		run_ctx.saved_run_ctx = NULL;
+		if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
 			/* recursion detected */
 			bpf_prog_put(prog);
 			return -EBUSY;
 		}
 		attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
-		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+		__bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
 		bpf_prog_put(prog);
 		return 0;
 #endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d5e6bc5517cb..baf1b65d523e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -568,11 +568,14 @@ static void notrace inc_misses_counter(struct bpf_prog *prog)
  * [2..MAX_U64] - execute bpf prog and record execution time.
  *     This is start time.
  */
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
 	__acquires(RCU)
 {
 	rcu_read_lock();
 	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
 		inc_misses_counter(prog);
 		return 0;
@@ -602,29 +605,38 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
 	}
 }
 
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
 	__releases(RCU)
 {
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
 	update_prog_stats(prog, start);
 	__this_cpu_dec(*(prog->active));
 	migrate_enable();
 	rcu_read_unlock();
 }
 
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
+u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
 {
 	rcu_read_lock_trace();
 	migrate_disable();
 	might_fault();
+
 	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
 		inc_misses_counter(prog);
 		return 0;
 	}
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
 	return bpf_prog_start_time();
 }
 
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
+void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+				       struct bpf_tramp_run_ctx *run_ctx)
 {
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
 	update_prog_stats(prog, start);
 	__this_cpu_dec(*(prog->active));
 	migrate_enable();
-- 
cgit v1.2.3


From 2fcc82411e74e5e6aba336561cf56fb899bfae4e Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Tue, 10 May 2022 13:59:21 -0700
Subject: bpf, x86: Attach a cookie to fentry/fexit/fmod_ret/lsm.

Pass a cookie along with BPF_LINK_CREATE requests.

Add a bpf_cookie field to struct bpf_tracing_link to attach a cookie.
The cookie of a bpf_tracing_link is available by calling
bpf_get_attach_cookie when running the BPF program of the attached
link.

The value of a cookie will be set at bpf_tramp_run_ctx by the
trampoline of the link.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220510205923.3206889-4-kuifeng@fb.com
---
 kernel/bpf/bpf_lsm.c     | 17 +++++++++++++++++
 kernel/bpf/syscall.c     | 12 ++++++++----
 kernel/bpf/trampoline.c  |  7 +++++--
 kernel/trace/bpf_trace.c | 17 +++++++++++++++++
 4 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 064eccba641d..c1351df9f7ee 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -117,6 +117,21 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = {
 	.allowed	= bpf_ima_inode_hash_allowed,
 };
 
+BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
+{
+	struct bpf_trace_run_ctx *run_ctx;
+
+	run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+	return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
+	.func		= bpf_get_attach_cookie,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 static const struct bpf_func_proto *
 bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -141,6 +156,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
 	case BPF_FUNC_ima_file_hash:
 		return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
+	case BPF_FUNC_get_attach_cookie:
+		return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
 	default:
 		return tracing_prog_func_proto(func_id, prog);
 	}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d48165fccf49..72e53489165d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2921,7 +2921,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
 
 static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 				   int tgt_prog_fd,
-				   u32 btf_id)
+				   u32 btf_id,
+				   u64 bpf_cookie)
 {
 	struct bpf_link_primer link_primer;
 	struct bpf_prog *tgt_prog = NULL;
@@ -2986,6 +2987,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
 		      &bpf_tracing_link_lops, prog);
 	link->attach_type = prog->expected_attach_type;
+	link->link.cookie = bpf_cookie;
 
 	mutex_lock(&prog->aux->dst_mutex);
 
@@ -3271,7 +3273,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 			tp_name = prog->aux->attach_func_name;
 			break;
 		}
-		return bpf_tracing_prog_attach(prog, 0, 0);
+		return bpf_tracing_prog_attach(prog, 0, 0, 0);
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
 		if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
@@ -4524,7 +4526,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_EXT:
 		ret = bpf_tracing_prog_attach(prog,
 					      attr->link_create.target_fd,
-					      attr->link_create.target_btf_id);
+					      attr->link_create.target_btf_id,
+					      attr->link_create.tracing.cookie);
 		break;
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_TRACING:
@@ -4539,7 +4542,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		else
 			ret = bpf_tracing_prog_attach(prog,
 						      attr->link_create.target_fd,
-						      attr->link_create.target_btf_id);
+						      attr->link_create.target_btf_id,
+						      attr->link_create.tracing.cookie);
 		break;
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_SK_LOOKUP:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index baf1b65d523e..0e9b3aefc34a 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -30,9 +30,12 @@ static DEFINE_MUTEX(trampoline_mutex);
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 {
 	enum bpf_attach_type eatype = prog->expected_attach_type;
+	enum bpf_prog_type ptype = prog->type;
 
-	return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
-	       eatype == BPF_MODIFY_RETURN;
+	return (ptype == BPF_PROG_TYPE_TRACING &&
+		(eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+		 eatype == BPF_MODIFY_RETURN)) ||
+		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }
 
 void *bpf_jit_alloc_exec_page(void)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7fd11c17558d..2eaac094caf8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1091,6 +1091,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx)
+{
+	struct bpf_trace_run_ctx *run_ctx;
+
+	run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+	return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
+	.func		= bpf_get_attach_cookie_tracing,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
 {
 #ifndef CONFIG_X86
@@ -1719,6 +1734,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
 	case BPF_FUNC_get_func_arg_cnt:
 		return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
+	case BPF_FUNC_get_attach_cookie:
+		return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
 	default:
 		fn = raw_tp_prog_func_proto(func_id, prog);
 		if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
-- 
cgit v1.2.3


From 434e09e7575b02e014931bc5672289fabd7a825c Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 10 May 2022 15:21:33 -0400
Subject: locking/qrwlock: Change "queue rwlock" to "queued rwlock"

Queued rwlock was originally named "queue rwlock" which wasn't quite
grammatically correct. However there are still some "queue rwlock"
references in the code. Change those to "queued rwlock" for consistency.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220510192134.434753-1-longman@redhat.com
---
 kernel/locking/qrwlock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 7f42e52a648f..2e1600906c9f 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -15,8 +15,8 @@
 #include <trace/events/lock.h>
 
 /**
- * queued_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
  */
 void queued_read_lock_slowpath(struct qrwlock *lock)
 {
@@ -60,8 +60,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
 EXPORT_SYMBOL(queued_read_lock_slowpath);
 
 /**
- * queued_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
  */
 void queued_write_lock_slowpath(struct qrwlock *lock)
 {
-- 
cgit v1.2.3


From 2679a83731d51a744657f718fc02c3b077e47562 Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Sat, 30 Apr 2022 16:58:42 +0800
Subject: sched/core: Avoid obvious double update_rq_clock warning

When we use raw_spin_rq_lock() to acquire the rq lock and have to
update the rq clock while holding the lock, the kernel may issue
a WARN_DOUBLE_CLOCK warning.

Since we directly use raw_spin_rq_lock() to acquire rq lock instead of
rq_lock(), there is no corresponding change to rq->clock_update_flags.
In particular, we have obtained the rq lock of other CPUs, the
rq->clock_update_flags of this CPU may be RQCF_UPDATED at this time, and
then calling update_rq_clock() will trigger the WARN_DOUBLE_CLOCK warning.

So we need to clear RQCF_UPDATED of rq->clock_update_flags to avoid
the WARN_DOUBLE_CLOCK warning.

For the sched_rt_period_timer() and migrate_task_rq_dl() cases
we simply replace raw_spin_rq_lock()/raw_spin_rq_unlock() with
rq_lock()/rq_unlock().

For the {pull,push}_{rt,dl}_task() cases, we add the
double_rq_clock_clear_update() function to clear RQCF_UPDATED of
rq->clock_update_flags, and call double_rq_clock_clear_update()
before double_lock_balance()/double_rq_lock() returns to avoid the
WARN_DOUBLE_CLOCK warning.

Some call trace reports:
Call Trace 1:
 <IRQ>
 sched_rt_period_timer+0x10f/0x3a0
 ? enqueue_top_rt_rq+0x110/0x110
 __hrtimer_run_queues+0x1a9/0x490
 hrtimer_interrupt+0x10b/0x240
 __sysvec_apic_timer_interrupt+0x8a/0x250
 sysvec_apic_timer_interrupt+0x9a/0xd0
 </IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x12/0x20

Call Trace 2:
 <TASK>
 activate_task+0x8b/0x110
 push_rt_task.part.108+0x241/0x2c0
 push_rt_tasks+0x15/0x30
 finish_task_switch+0xaa/0x2e0
 ? __switch_to+0x134/0x420
 __schedule+0x343/0x8e0
 ? hrtimer_start_range_ns+0x101/0x340
 schedule+0x4e/0xb0
 do_nanosleep+0x8e/0x160
 hrtimer_nanosleep+0x89/0x120
 ? hrtimer_init_sleeper+0x90/0x90
 __x64_sys_nanosleep+0x96/0xd0
 do_syscall_64+0x34/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 3:
 <TASK>
 deactivate_task+0x93/0xe0
 pull_rt_task+0x33e/0x400
 balance_rt+0x7e/0x90
 __schedule+0x62f/0x8e0
 do_task_dead+0x3f/0x50
 do_exit+0x7b8/0xbb0
 do_group_exit+0x2d/0x90
 get_signal+0x9df/0x9e0
 ? preempt_count_add+0x56/0xa0
 ? __remove_hrtimer+0x35/0x70
 arch_do_signal_or_restart+0x36/0x720
 ? nanosleep_copyout+0x39/0x50
 ? do_nanosleep+0x131/0x160
 ? audit_filter_inodes+0xf5/0x120
 exit_to_user_mode_prepare+0x10f/0x1e0
 syscall_exit_to_user_mode+0x17/0x30
 do_syscall_64+0x40/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 4:
 update_rq_clock+0x128/0x1a0
 migrate_task_rq_dl+0xec/0x310
 set_task_cpu+0x84/0x1e4
 try_to_wake_up+0x1d8/0x5c0
 wake_up_process+0x1c/0x30
 hrtimer_wakeup+0x24/0x3c
 __hrtimer_run_queues+0x114/0x270
 hrtimer_interrupt+0xe8/0x244
 arch_timer_handler_phys+0x30/0x50
 handle_percpu_devid_irq+0x88/0x140
 generic_handle_domain_irq+0x40/0x60
 gic_handle_irq+0x48/0xe0
 call_on_irq_stack+0x2c/0x60
 do_interrupt_handler+0x80/0x84

Steps to reproduce:
1. Enable CONFIG_SCHED_DEBUG when compiling the kernel
2. echo 1 > /sys/kernel/debug/clear_warn_once
   echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched/features
   echo "NO_RT_PUSH_IPI" > /sys/kernel/debug/sched/features
3. Run some rt/dl tasks that periodically work and sleep, e.g.
Create 2*n rt or dl (90% running) tasks via rt-app (on a system
with n CPUs), and Dietmar Eggemann reports Call Trace 4 when running
on PREEMPT_RT kernel.

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20220430085843.62939-2-jiahao.os@bytedance.com
---
 kernel/sched/core.c     |  6 +++---
 kernel/sched/deadline.c |  5 +++--
 kernel/sched/rt.c       |  5 +++--
 kernel/sched/sched.h    | 28 ++++++++++++++++++++++++----
 4 files changed, 33 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48cfad152b86..2c9f5e97682f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -613,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
 		swap(rq1, rq2);
 
 	raw_spin_rq_lock(rq1);
-	if (__rq_lockp(rq1) == __rq_lockp(rq2))
-		return;
+	if (__rq_lockp(rq1) != __rq_lockp(rq2))
+		raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
 
-	raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+	double_rq_clock_clear_update(rq1, rq2);
 }
 #endif
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6ae423627a7a..0ad2818e77a2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1830,6 +1830,7 @@ out:
 
 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
 {
+	struct rq_flags rf;
 	struct rq *rq;
 
 	if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1841,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 	 * from try_to_wake_up(). Hence, p->pi_lock is locked, but
 	 * rq->lock is not... So, lock it
 	 */
-	raw_spin_rq_lock(rq);
+	rq_lock(rq, &rf);
 	if (p->dl.dl_non_contending) {
 		update_rq_clock(rq);
 		sub_running_bw(&p->dl, &rq->dl);
@@ -1857,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 			put_task_struct(p);
 	}
 	sub_rq_bw(&p->dl, &rq->dl);
-	raw_spin_rq_unlock(rq);
+	rq_unlock(rq, &rf);
 }
 
 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a32c46889af8..7891c0f0e1ff 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+		struct rq_flags rf;
 		int skip;
 
 		/*
@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		if (skip)
 			continue;
 
-		raw_spin_rq_lock(rq);
+		rq_lock(rq, &rf);
 		update_rq_clock(rq);
 
 		if (rt_rq->rt_time) {
@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 
 		if (enqueue)
 			sched_rt_rq_enqueue(rt_rq);
-		raw_spin_rq_unlock(rq);
+		rq_unlock(rq, &rf);
 	}
 
 	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f338c53ce42..fe4d1acb7e38 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2474,6 +2474,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+	rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+	/* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+#ifdef CONFIG_SMP
+	rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
 
 #ifdef CONFIG_SMP
 
@@ -2539,14 +2557,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
-	if (__rq_lockp(this_rq) == __rq_lockp(busiest))
-		return 0;
-
-	if (likely(raw_spin_rq_trylock(busiest)))
+	if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+	    likely(raw_spin_rq_trylock(busiest))) {
+		double_rq_clock_clear_update(this_rq, busiest);
 		return 0;
+	}
 
 	if (rq_order_less(this_rq, busiest)) {
 		raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+		double_rq_clock_clear_update(this_rq, busiest);
 		return 0;
 	}
 
@@ -2640,6 +2659,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	BUG_ON(rq1 != rq2);
 	raw_spin_rq_lock(rq1);
 	__acquire(rq2->lock);	/* Fake it out ;) */
+	double_rq_clock_clear_update(rq1, rq2);
 }
 
 /*
-- 
cgit v1.2.3


From 734387ec2f9d77b00276042b1fa7c95f48ee879d Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Sat, 30 Apr 2022 16:58:43 +0800
Subject: sched/deadline: Remove superfluous rq clock update in push_dl_task()

The change to call update_rq_clock() before activate_task()
commit 840d719604b0 ("sched/deadline: Update rq_clock of later_rq
when pushing a task") is no longer needed since commit f4904815f97a
("sched/deadline: Fix double accounting of rq/running bw in push & pull")
removed the add_running_bw() before the activate_task().

So we remove some comments that are no longer needed and update
rq clock in activate_task().

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lore.kernel.org/r/20220430085843.62939-3-jiahao.os@bytedance.com
---
 kernel/sched/deadline.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0ad2818e77a2..936817ae142f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2318,13 +2318,7 @@ retry:
 
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, later_rq->cpu);
-
-	/*
-	 * Update the later_rq clock here, because the clock is used
-	 * by the cpufreq_update_util() inside __add_running_bw().
-	 */
-	update_rq_clock(later_rq);
-	activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
 	resched_curr(later_rq);
-- 
cgit v1.2.3


From c9d8923bfbcb63f15ea6cb2b5c8426fc3d96f643 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <Pierre.Gondois@arm.com>
Date: Tue, 26 Apr 2022 16:44:48 +0200
Subject: PM: EM: Decrement policy counter

In commit e458716a92b57 ("PM: EM: Mark inefficiencies in CPUFreq"),
cpufreq_cpu_get() is called without a cpufreq_cpu_put(), permanently
increasing the reference counts of the policy struct.

Decrement the reference count once the policy struct is not used
anymore.

Fixes: e458716a92b57 ("PM: EM: Mark inefficiencies in CPUFreq")
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Vincent Donnefort <vincent.donnefort@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/energy_model.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 092513575e4e..6c373f2960e7 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -259,6 +259,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev)
 			found++;
 	}
 
+	cpufreq_cpu_put(policy);
+
 	if (!found)
 		return;
 
-- 
cgit v1.2.3


From b3f9916d81e8ffb21cbe7abccf63f86a5a1d598a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 11 May 2022 12:16:21 -0500
Subject: sched: Update task_tick_numa to ignore tasks without an mm

Qian Cai <quic_qiancai@quicinc.com> wrote:
> Reverting the last 3 commits of the series fixed a boot crash.
>
> 1b2552cbdbe0 fork: Stop allowing kthreads to call execve
> 753550eb0ce1 fork: Explicitly set PF_KTHREAD
> 68d85f0a33b0 init: Deal with the init process being a user mode process
>
>  BUG: KASAN: null-ptr-deref in task_nr_scan_windows.isra.0
>  arch_atomic_long_read at ./include/linux/atomic/atomic-long.h:29
>  (inlined by) atomic_long_read at ./include/linux/atomic/atomic-instrumented.h:1266
>  (inlined by) get_mm_counter at ./include/linux/mm.h:1996
>  (inlined by) get_mm_rss at ./include/linux/mm.h:2049
>  (inlined by) task_nr_scan_windows at kernel/sched/fair.c:1123
>  Read of size 8 at addr 00000000000003d0 by task swapper/0/1

With the change to init and the user mode helper processes to not have
PF_KTHREAD set before they call kernel_execve the PF_KTHREAD test in
task_tick_numa became insufficient to detect all tasks that have
"->mm == NULL".  Correct that by testing for "->mm == NULL" directly.

Reported-by: Qian Cai <quic_qiancai@quicinc.com>
Tested-by: Qian Cai <quic_qiancai@quicinc.com>
Fixes: 1b2552cbdbe0 ("fork: Stop allowing kthreads to call execve")
Link: https://lkml.kernel.org/r/87r150ug1l.fsf_-_@email.froward.int.ebiederm.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bd299d67ab..db6f0df9d43e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2915,7 +2915,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
 	 */
-	if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
 		return;
 
 	/*
-- 
cgit v1.2.3


From 92826e967535db2eb117db227b1191aaf98e4bb3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 23 Apr 2022 19:20:24 +0200
Subject: dma-direct: don't fail on highmem CMA pages in dma_direct_alloc_pages

When dma_direct_alloc_pages encounters a highmem page it just gives up
currently.  But what we really should do is to try memory using the
page allocator instead - without this platforms with a global highmem
CMA pool will fail all dma_alloc_pages allocations.

Fixes: efa70f2fdc84 ("dma-mapping: add a new dma_alloc_pages API")
Reported-by: Mark O'Neill <mao@tumblingdice.co.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9743c6ccce1a..3e7f4aab740e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -115,7 +115,7 @@ static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
 }
 
 static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
-		gfp_t gfp)
+		gfp_t gfp, bool allow_highmem)
 {
 	int node = dev_to_node(dev);
 	struct page *page = NULL;
@@ -129,9 +129,12 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 	gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
 					   &phys_limit);
 	page = dma_alloc_contiguous(dev, size, gfp);
-	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
-		dma_free_contiguous(dev, page, size);
-		page = NULL;
+	if (page) {
+		if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
+		    (!allow_highmem && PageHighMem(page))) {
+			dma_free_contiguous(dev, page, size);
+			page = NULL;
+		}
 	}
 again:
 	if (!page)
@@ -189,7 +192,7 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
 {
 	struct page *page;
 
-	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
 	if (!page)
 		return NULL;
 
@@ -262,7 +265,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	/* we always manually zero the memory once we are done */
-	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
 	if (!page)
 		return NULL;
 
@@ -370,19 +373,9 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 	if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
-	page = __dma_direct_alloc_pages(dev, size, gfp);
+	page = __dma_direct_alloc_pages(dev, size, gfp, false);
 	if (!page)
 		return NULL;
-	if (PageHighMem(page)) {
-		/*
-		 * Depending on the cma= arguments and per-arch setup
-		 * dma_alloc_contiguous could return highmem pages.
-		 * Without remapping there is no way to return them here,
-		 * so log an error and fail.
-		 */
-		dev_info(dev, "Rejecting highmem page from CMA.\n");
-		goto out_free_pages;
-	}
 
 	ret = page_address(page);
 	if (dma_set_decrypted(dev, ret, size))
-- 
cgit v1.2.3


From 84bc4f1dbbbb5f8aa68706a96711dccb28b518e5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 10 May 2022 13:17:32 -0400
Subject: dma-debug: change allocation mode from GFP_NOWAIT to GFP_ATIOMIC

We observed the error "cacheline tracking ENOMEM, dma-debug disabled"
during a light system load (copying some files). The reason for this error
is that the dma_active_cacheline radix tree uses GFP_NOWAIT allocation -
so it can't access the emergency memory reserves and it fails as soon as
anybody reaches the watermark.

This patch changes GFP_NOWAIT to GFP_ATOMIC, so that it can access the
emergency memory reserves.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f8ff598596b8..ac740630c79c 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -448,7 +448,7 @@ void debug_dma_dump_mappings(struct device *dev)
  * other hand, consumes a single dma_debug_entry, but inserts 'nents'
  * entries into the tree.
  */
-static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC);
 static DEFINE_SPINLOCK(radix_lock);
 #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
 #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
-- 
cgit v1.2.3


From 28b3ae426598e722cf5d5ab9cc7038791b955a56 Mon Sep 17 00:00:00 2001
From: Uladzislau Rezki <uladzislau.rezki@sony.com>
Date: Wed, 16 Feb 2022 14:52:09 +0100
Subject: rcu: Introduce CONFIG_RCU_EXP_CPU_STALL_TIMEOUT

Currently both expedited and regular grace period stall warnings use
a single timeout value that with units of seconds.  However, recent
Android use cases problem require a sub-100-millisecond expedited RCU CPU
stall warning.  Given that expedited RCU grace periods normally complete
in far less than a single millisecond, especially for small systems,
this is not unreasonable.

Therefore introduce the CONFIG_RCU_EXP_CPU_STALL_TIMEOUT kernel
configuration that defaults to 20 msec on Android and remains the same
as that of the non-expedited stall warnings otherwise.  It also can be
changed in run-time via: /sys/.../parameters/rcu_exp_cpu_stall_timeout.

[ paulmck: Default of zero to use CONFIG_RCU_STALL_TIMEOUT. ]

Signed-off-by: Uladzislau Rezki <uladzislau.rezki@sony.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug | 14 ++++++++++++++
 kernel/rcu/rcu.h         |  2 ++
 kernel/rcu/tree_exp.h    |  4 ++--
 kernel/rcu/tree_stall.h  | 28 ++++++++++++++++++++++++++++
 kernel/rcu/update.c      |  2 ++
 5 files changed, 48 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 4fd64999300f..0b397b5bf846 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -91,6 +91,20 @@ config RCU_CPU_STALL_TIMEOUT
 	  RCU grace period persists, additional CPU stall warnings are
 	  printed at more widely spaced intervals.
 
+config RCU_EXP_CPU_STALL_TIMEOUT
+	int "Expedited RCU CPU stall timeout in milliseconds"
+	depends on RCU_STALL_COMMON
+	range 0 21000
+	default 20 if ANDROID
+	default 0 if !ANDROID
+	help
+	  If a given expedited RCU grace period extends more than the
+	  specified number of milliseconds, a CPU stall warning is printed.
+	  If the RCU grace period persists, additional CPU stall warnings
+	  are printed at more widely spaced intervals.  A value of zero
+	  says to use the RCU_CPU_STALL_TIMEOUT value converted from
+	  seconds to milliseconds.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU"
 	depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..20f0300f6cb1 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -210,7 +210,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
 extern int rcu_cpu_stall_ftrace_dump;
 extern int rcu_cpu_stall_suppress;
 extern int rcu_cpu_stall_timeout;
+extern int rcu_exp_cpu_stall_timeout;
 int rcu_jiffies_till_stall_check(void);
+int rcu_exp_jiffies_till_stall_check(void);
 
 static inline bool rcu_stall_is_suppressed(void)
 {
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 60197ea24ceb..b1f52b59fa4b 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -496,7 +496,7 @@ static void synchronize_rcu_expedited_wait(void)
 	struct rcu_node *rnp_root = rcu_get_root();
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
-	jiffies_stall = rcu_jiffies_till_stall_check();
+	jiffies_stall = rcu_exp_jiffies_till_stall_check();
 	jiffies_start = jiffies;
 	if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
 		if (synchronize_rcu_expedited_wait_once(1))
@@ -571,7 +571,7 @@ static void synchronize_rcu_expedited_wait(void)
 				dump_cpu_task(cpu);
 			}
 		}
-		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+		jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
 	}
 }
 
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 0c5d8516516a..009d3f9305cf 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly;
 #define RCU_STALL_MIGHT_DIV		8
 #define RCU_STALL_MIGHT_MIN		(2 * HZ)
 
+int rcu_exp_jiffies_till_stall_check(void)
+{
+	int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+	int exp_stall_delay_delta = 0;
+	int till_stall_check;
+
+	// Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+	if (!cpu_stall_timeout)
+		cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+
+	// Limit check must be consistent with the Kconfig limits for
+	// CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+	// The minimum clamped value is "2UL", because at least one full
+	// tick has to be guaranteed.
+	till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ);
+
+	if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+		WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+
+#ifdef CONFIG_PROVE_RCU
+	/* Add extra ~25% out of till_stall_check. */
+	exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+#endif
+
+	return till_stall_check + exp_stall_delay_delta;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+
 /* Limit-check stall timeouts specified at boottime and runtime. */
 int rcu_jiffies_till_stall_check(void)
 {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 180ff9c41fa8..fc7fef575606 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
 module_param(rcu_cpu_stall_suppress, int, 0644);
 int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
 module_param(rcu_cpu_stall_timeout, int, 0644);
+int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
+module_param(rcu_exp_cpu_stall_timeout, int, 0644);
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
 
 // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
-- 
cgit v1.2.3


From 9621fbee44df940e2e1b94b0676460a538dffefa Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Fri, 8 Apr 2022 17:35:27 -0700
Subject: rcu: Move expedited grace period (GP) work to RT kthread_worker

Enabling CONFIG_RCU_BOOST did not reduce RCU expedited grace-period
latency because its workqueues run at SCHED_OTHER, and thus can be
delayed by normal processes.  This commit avoids these delays by moving
the expedited GP work items to a real-time-priority kthread_worker.

This option is controlled by CONFIG_RCU_EXP_KTHREAD and disabled by
default on PREEMPT_RT=y kernels which disable expedited grace periods
after boot by unconditionally setting rcupdate.rcu_normal_after_boot=1.

The results were evaluated on arm64 Android devices (6GB ram) running
5.10 kernel, and capturing trace data in critical user-level code.

The table below shows the resulting order-of-magnitude improvements
in synchronize_rcu_expedited() latency:

------------------------------------------------------------------------
|                          |   workqueues  |  kthread_worker |  Diff   |
------------------------------------------------------------------------
| Count                    |          725  |            688  |         |
------------------------------------------------------------------------
| Min Duration       (ns)  |          326  |            447  |  37.12% |
------------------------------------------------------------------------
| Q1                 (ns)  |       39,428  |         38,971  |  -1.16% |
------------------------------------------------------------------------
| Q2 - Median        (ns)  |       98,225  |         69,743  | -29.00% |
------------------------------------------------------------------------
| Q3                 (ns)  |      342,122  |        126,638  | -62.98% |
------------------------------------------------------------------------
| Max Duration       (ns)  |  372,766,967  |      2,329,671  | -99.38% |
------------------------------------------------------------------------
| Avg Duration       (ns)  |    2,746,353  |        151,242  | -94.49% |
------------------------------------------------------------------------
| Standard Deviation (ns)  |   19,327,765  |        294,408  |         |
------------------------------------------------------------------------

The below table show the range of maximums/minimums for
synchronize_rcu_expedited() latency from all experiments:

------------------------------------------------------------------------
|                          |   workqueues  |  kthread_worker |  Diff   |
------------------------------------------------------------------------
| Total No. of Experiments |           25  |             23  |         |
------------------------------------------------------------------------
| Largest  Maximum   (ns)  |  372,766,967  |      2,329,671  | -99.38% |
------------------------------------------------------------------------
| Smallest Maximum   (ns)  |       38,819  |         86,954  | 124.00% |
------------------------------------------------------------------------
| Range of Maximums  (ns)  |  372,728,148  |      2,242,717  |         |
------------------------------------------------------------------------
| Largest  Minimum   (ns)  |       88,623  |         27,588  | -68.87% |
------------------------------------------------------------------------
| Smallest Minimum   (ns)  |          326  |            447  |  37.12% |
------------------------------------------------------------------------
| Range of Minimums  (ns)  |       88,297  |         27,141  |         |
------------------------------------------------------------------------

Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Reported-by: Tim Murray <timmurray@google.com>
Reported-by: Wei Wang <wvw@google.com>
Tested-by: Kyle Lin <kylelin@google.com>
Tested-by: Chunwei Lu <chunweilu@google.com>
Tested-by: Lulu Wang <luluw@google.com>
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig    |  14 +++++
 kernel/rcu/rcu.h      |   5 ++
 kernel/rcu/tree.c     |  51 ++++++++++++++++--
 kernel/rcu/tree.h     |   5 ++
 kernel/rcu/tree_exp.h | 147 +++++++++++++++++++++++++++++++++++++++-----------
 5 files changed, 188 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bf8e341e75b4..fd64a75823cb 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -195,6 +195,20 @@ config RCU_BOOST_DELAY
 
 	  Accept the default if unsure.
 
+config RCU_EXP_KTHREAD
+	bool "Perform RCU expedited work in a real-time kthread"
+	depends on RCU_BOOST && RCU_EXPERT
+	default !PREEMPT_RT && NR_CPUS <= 32
+	help
+	  Use this option to further reduce the latencies of expedited
+	  grace periods at the expense of being more disruptive.
+
+	  This option is disabled by default on PREEMPT_RT=y kernels which
+	  disable expedited grace periods after boot by unconditionally
+	  setting rcupdate.rcu_normal_after_boot=1.
+
+	  Accept the default if unsure.
+
 config RCU_NOCB_CPU
 	bool "Offload RCU callback processing from boot-selected CPUs"
 	depends on TREE_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 20f0300f6cb1..e27bf7d1e3a4 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -536,7 +536,12 @@ int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
+#else /* !CONFIG_RCU_EXP_KTHREAD */
 extern struct workqueue_struct *rcu_par_gp_wq;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..763e45fdf49b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4471,6 +4471,51 @@ static int rcu_pm_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+	struct sched_param param = { .sched_priority = kthread_prio };
+
+	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+		pr_err("Failed to create %s!\n", gp_kworker_name);
+		return;
+	}
+
+	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+		pr_err("Failed to create %s!\n", par_gp_kworker_name);
+		kthread_destroy_worker(rcu_exp_gp_kworker);
+		return;
+	}
+
+	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+				   &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
 /*
  * Spawn the kthreads that handle RCU's grace periods.
  */
@@ -4500,6 +4545,8 @@ static int __init rcu_spawn_gp_kthread(void)
 	rcu_spawn_nocb_kthreads();
 	rcu_spawn_boost_kthreads();
 	rcu_spawn_core_kthreads();
+	/* Create kthread worker for expedited GPs */
+	rcu_start_exp_gp_kworkers();
 	return 0;
 }
 early_initcall(rcu_spawn_gp_kthread);
@@ -4745,7 +4792,6 @@ static void __init rcu_dump_rcu_node_tree(void)
 }
 
 struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
 
 static void __init kfree_rcu_batch_init(void)
 {
@@ -4811,8 +4857,7 @@ void __init rcu_init(void)
 	/* Create workqueue for Tree SRCU and for expedited GPs. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
-	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-	WARN_ON(!rcu_par_gp_wq);
+	rcu_alloc_par_gp_wq();
 
 	/* Fill in default value for rcutree.qovld boot parameter. */
 	/* -After- the rcu_node ->lock fields are initialized! */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673ebe355..b577cdfdc851 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
  */
 
 #include <linux/cache.h>
+#include <linux/kthread.h>
 #include <linux/spinlock.h>
 #include <linux/rtmutex.h>
 #include <linux/threads.h>
@@ -23,7 +24,11 @@
 /* Communicate arguments to a workqueue handler. */
 struct rcu_exp_work {
 	unsigned long rew_s;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+	struct kthread_work rew_work;
+#else
 	struct work_struct rew_work;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
 };
 
 /* RCU's kthread states for tracing. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index b1f52b59fa4b..0f70f62039a9 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -334,15 +334,13 @@ fastpath:
  * Select the CPUs within the specified rcu_node that the upcoming
  * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
 {
 	int cpu;
 	unsigned long flags;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	int ret;
-	struct rcu_exp_work *rewp =
-		container_of(wp, struct rcu_exp_work, rew_work);
 	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -417,13 +415,119 @@ retry_ipi:
 		rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
 }
 
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+#ifdef CONFIG_RCU_EXP_KTHREAD
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+	struct rcu_exp_work *rewp =
+		container_of(wp, struct rcu_exp_work, rew_work);
+
+	__sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+	return !!READ_ONCE(rcu_exp_par_gp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+	kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+	/*
+	 * Use rcu_exp_par_gp_kworker, because flushing a work item from
+	 * another work item on the same kthread worker can result in
+	 * deadlock.
+	 */
+	kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+	kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+	struct rcu_exp_work *rewp;
+
+	rewp = container_of(wp, struct rcu_exp_work, rew_work);
+	rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+	kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+	kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+{
+	struct rcu_exp_work *rewp =
+		container_of(wp, struct rcu_exp_work, rew_work);
+
+	__sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+	return !!READ_ONCE(rcu_par_gp_wq);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+	int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
+
+	INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+	/* If all offline, queue the work on an unbound CPU. */
+	if (unlikely(cpu > rnp->grphi - rnp->grplo))
+		cpu = WORK_CPU_UNBOUND;
+	else
+		cpu += rnp->grplo;
+	queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+	flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+	struct rcu_exp_work *rewp;
+
+	rewp = container_of(wp, struct rcu_exp_work, rew_work);
+	rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+	INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
+	queue_work(rcu_gp_wq, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+	destroy_work_on_stack(&rew->rew_work);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
 /*
  * Select the nodes that the upcoming expedited grace period needs
  * to wait for.
  */
 static void sync_rcu_exp_select_cpus(void)
 {
-	int cpu;
 	struct rcu_node *rnp;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void)
 		rnp->exp_need_flush = false;
 		if (!READ_ONCE(rnp->expmask))
 			continue; /* Avoid early boot non-existent wq. */
-		if (!READ_ONCE(rcu_par_gp_wq) ||
+		if (!rcu_gp_par_worker_started() ||
 		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
 		    rcu_is_last_leaf_node(rnp)) {
-			/* No workqueues yet or last leaf, do direct call. */
+			/* No worker started yet or last leaf, do direct call. */
 			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
 			continue;
 		}
-		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
-		cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-		/* If all offline, queue the work on an unbound CPU. */
-		if (unlikely(cpu > rnp->grphi - rnp->grplo))
-			cpu = WORK_CPU_UNBOUND;
-		else
-			cpu += rnp->grplo;
-		queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+		sync_rcu_exp_select_cpus_queue_work(rnp);
 		rnp->exp_need_flush = true;
 	}
 
-	/* Wait for workqueue jobs (if any) to complete. */
+	/* Wait for jobs (if any) to complete. */
 	rcu_for_each_leaf_node(rnp)
 		if (rnp->exp_need_flush)
-			flush_work(&rnp->rew.rew_work);
+			sync_rcu_exp_select_cpus_flush_work(rnp);
 }
 
 /*
@@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
 	rcu_exp_wait_wake(s);
 }
 
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
-	struct rcu_exp_work *rewp;
-
-	rewp = container_of(wp, struct rcu_exp_work, rew_work);
-	rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
 #ifdef CONFIG_PREEMPT_RCU
 
 /*
@@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void)
 	} else {
 		/* Marshall arguments & schedule the expedited grace period. */
 		rew.rew_s = s;
-		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-		queue_work(rcu_gp_wq, &rew.rew_work);
+		synchronize_rcu_expedited_queue_work(&rew);
 	}
 
 	/* Wait for expedited grace period to complete. */
 	rnp = rcu_get_root();
 	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
 		   sync_exp_work_done(s));
-	smp_mb(); /* Workqueue actions happen before return. */
+	smp_mb(); /* Work actions happen before return. */
 
 	/* Let the next expedited grace period start. */
 	mutex_unlock(&rcu_state.exp_mutex);
 
 	if (likely(!boottime))
-		destroy_work_on_stack(&rew.rew_work);
+		synchronize_rcu_expedited_destroy_work(&rew);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-- 
cgit v1.2.3


From 157cc18122b4a1456d19048e151a164216c4a704 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Apr 2022 09:48:54 -0500
Subject: signal: Rename send_signal send_signal_locked

Rename send_signal and __send_signal to send_signal_locked and
__send_signal_locked to make send_signal usable outside of
signal.c.

Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-1-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 30cd1ca43bcd..b0403197b0ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1071,8 +1071,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
-static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
-			enum pid_type type, bool force)
+static int __send_signal_locked(int sig, struct kernel_siginfo *info,
+				struct task_struct *t, enum pid_type type, bool force)
 {
 	struct sigpending *pending;
 	struct sigqueue *q;
@@ -1212,8 +1212,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
 	return ret;
 }
 
-static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
-			enum pid_type type)
+int send_signal_locked(int sig, struct kernel_siginfo *info,
+		       struct task_struct *t, enum pid_type type)
 {
 	/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
 	bool force = false;
@@ -1245,7 +1245,7 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct
 			force = true;
 		}
 	}
-	return __send_signal(sig, info, t, type, force);
+	return __send_signal_locked(sig, info, t, type, force);
 }
 
 static void print_fatal_signal(int signr)
@@ -1284,7 +1284,7 @@ __setup("print-fatal-signals=", setup_print_fatal_signals);
 int
 __group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
 {
-	return send_signal(sig, info, p, PIDTYPE_TGID);
+	return send_signal_locked(sig, info, p, PIDTYPE_TGID);
 }
 
 int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
@@ -1294,7 +1294,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
 	int ret = -ESRCH;
 
 	if (lock_task_sighand(p, &flags)) {
-		ret = send_signal(sig, info, p, type);
+		ret = send_signal_locked(sig, info, p, type);
 		unlock_task_sighand(p, &flags);
 	}
 
@@ -1347,7 +1347,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 	if (action->sa.sa_handler == SIG_DFL &&
 	    (!t->ptrace || (handler == HANDLER_EXIT)))
 		t->signal->flags &= ~SIGNAL_UNKILLABLE;
-	ret = send_signal(sig, info, t, PIDTYPE_PID);
+	ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 
 	return ret;
@@ -1567,7 +1567,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
 
 	if (sig) {
 		if (lock_task_sighand(p, &flags)) {
-			ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
+			ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
 			unlock_task_sighand(p, &flags);
 		} else
 			ret = -ESRCH;
@@ -2103,7 +2103,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	 * parent's namespaces.
 	 */
 	if (valid_signal(sig) && sig)
-		__send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
+		__send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
 
@@ -2601,7 +2601,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
 	/* If the (new) signal is now blocked, requeue it.  */
 	if (sigismember(&current->blocked, signr) ||
 	    fatal_signal_pending(current)) {
-		send_signal(signr, info, current, type);
+		send_signal_locked(signr, info, current, type);
 		signr = 0;
 	}
 
@@ -4793,7 +4793,7 @@ void kdb_send_sig(struct task_struct *t, int sig)
 			   "the deadlock.\n");
 		return;
 	}
-	ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
+	ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
 	spin_unlock(&t->sighand->siglock);
 	if (ret)
 		kdb_printf("Fail to deliver Signal %d to process %d.\n",
-- 
cgit v1.2.3


From e71ba124078e391879e0bf111529fa2d630d106c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 Apr 2022 09:28:50 -0500
Subject: signal: Replace __group_send_sig_info with send_signal_locked

The function __group_send_sig_info is just a light wrapper around
send_signal_locked with one parameter fixed to a constant value.  As
the wrapper adds no real value update the code to directly call the
wrapped function.

Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-2-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c                | 8 +-------
 kernel/time/posix-cpu-timers.c | 6 +++---
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index b0403197b0ad..72d96614effc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1281,12 +1281,6 @@ static int __init setup_print_fatal_signals(char *str)
 
 __setup("print-fatal-signals=", setup_print_fatal_signals);
 
-int
-__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
-{
-	return send_signal_locked(sig, info, p, PIDTYPE_TGID);
-}
-
 int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
 			enum pid_type type)
 {
@@ -2173,7 +2167,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
 	spin_lock_irqsave(&sighand->siglock, flags);
 	if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
 	    !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
-		__group_send_sig_info(SIGCHLD, &info, parent);
+		send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);
 	/*
 	 * Even if SIGCHLD is not generated, we must wake up wait4 calls.
 	 */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0a97193984db..cb925e8ef9a8 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -870,7 +870,7 @@ static inline void check_dl_overrun(struct task_struct *tsk)
 {
 	if (tsk->dl.dl_overrun) {
 		tsk->dl.dl_overrun = 0;
-		__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+		send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
 	}
 }
 
@@ -884,7 +884,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard)
 			rt ? "RT" : "CPU", hard ? "hard" : "soft",
 			current->comm, task_pid_nr(current));
 	}
-	__group_send_sig_info(signo, SEND_SIG_PRIV, current);
+	send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID);
 	return true;
 }
 
@@ -958,7 +958,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 		trace_itimer_expire(signo == SIGPROF ?
 				    ITIMER_PROF : ITIMER_VIRTUAL,
 				    task_tgid(tsk), cur_time);
-		__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
+		send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID);
 	}
 
 	if (it->expires && it->expires < *expires)
-- 
cgit v1.2.3


From 16cc1bc67de88be19fa595f4645506ea2ac106d2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Apr 2022 11:09:11 -0500
Subject: ptrace: Remove arch_ptrace_attach

The last remaining implementation of arch_ptrace_attach is ia64's
ptrace_attach_sync_user_rbs which was added at the end of 2007 in
commit aa91a2e90044 ("[IA64] Synchronize RBS on PTRACE_ATTACH").

Reading the comments and examining the code ptrace_attach_sync_user_rbs
has the sole purpose of saving registers to the stack when ptrace_attach
changes TASK_STOPPED to TASK_TRACED.  In all other cases arch_ptrace_stop
takes care of the register saving.

In commit d79fdd6d96f4 ("ptrace: Clean transitions between TASK_STOPPED and TRACED")
modified ptrace_attach to wake up the thread and enter ptrace_stop normally even
when the thread starts out stopped.

This makes ptrace_attach_sync_user_rbs completely unnecessary.  So just
remove it.

I read through the code to verify that ptrace_attach_sync_user_rbs is
unnecessary.  What I found is that the code is quite dead.

Reading ptrace_attach_sync_user_rbs it is easy to see that the it does
nothing unless __state == TASK_STOPPED.

Calling arch_ptrace_attach (aka ptrace_attach_sync_user_rbs) after
ptrace_traceme it is easy to see that because we are talking about the
current process the value of __state is TASK_RUNNING.  Which means
ptrace_attach_sync_user_rbs does nothing.

The only other call of arch_ptrace_attach (aka
ptrace_attach_sync_user_rbs) is after ptrace_attach.

If the task is running (and PTRACE_SEIZE is not specified), a SIGSTOP
is sent which results in do_signal_stop setting JOBCTL_TRAP_STOP on
the target task (as it is ptraced) and the target task stopping
in ptrace_stop with __state == TASK_TRACED.

If the task was already stopped then ptrace_attach sets
JOBCTL_TRAPPING and JOBCTL_TRAP_STOP, wakes it out of __TASK_STOPPED,
and waits until the JOBCTL_TRAPPING_BIT is clear.  At which point
the task stops in ptrace_stop.

In both cases there are a couple of funning excpetions such as if the
traced task receiveds a SIGCONT, or is set a fatal signal.

However in all of those cases the tracee never stops in __state
TASK_STOPPED.  Which is a long way of saying that ptrace_attach_sync_user_rbs
is guaranteed never to do anything.

Cc: linux-ia64@vger.kernel.org
Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-4-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/ptrace.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ccc4b465775b..da30dcd477a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1285,10 +1285,6 @@ int ptrace_request(struct task_struct *child, long request,
 	return ret;
 }
 
-#ifndef arch_ptrace_attach
-#define arch_ptrace_attach(child)	do { } while (0)
-#endif
-
 SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 		unsigned long, data)
 {
@@ -1297,8 +1293,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 
 	if (request == PTRACE_TRACEME) {
 		ret = ptrace_traceme();
-		if (!ret)
-			arch_ptrace_attach(current);
 		goto out;
 	}
 
@@ -1310,12 +1304,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 
 	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
 		ret = ptrace_attach(child, request, addr, data);
-		/*
-		 * Some architectures need to do book-keeping after
-		 * a ptrace attach.
-		 */
-		if (!ret)
-			arch_ptrace_attach(child);
 		goto out_put_task_struct;
 	}
 
@@ -1455,12 +1443,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
 
 	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
 		ret = ptrace_attach(child, request, addr, data);
-		/*
-		 * Some architectures need to do book-keeping after
-		 * a ptrace attach.
-		 */
-		if (!ret)
-			arch_ptrace_attach(child);
 		goto out_put_task_struct;
 	}
 
-- 
cgit v1.2.3


From cb3c19c93d656caa6fe63d6277aabd7e570f1d03 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Apr 2022 09:16:10 -0500
Subject: signal: Use lockdep_assert_held instead of assert_spin_locked

The distinction is that assert_spin_locked() checks if the lock is
held *by*anyone* whereas lockdep_assert_held() asserts the current
context holds the lock.  Also, the check goes away if you build
without lockdep.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/Ympr/+PX4XgT/UKU@hirez.programming.kicks-ass.net
Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-6-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 72d96614effc..3fd2ce133387 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -884,7 +884,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
 static void ptrace_trap_notify(struct task_struct *t)
 {
 	WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
-	assert_spin_locked(&t->sighand->siglock);
+	lockdep_assert_held(&t->sighand->siglock);
 
 	task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
 	ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
@@ -1079,7 +1079,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info,
 	int override_rlimit;
 	int ret = 0, result;
 
-	assert_spin_locked(&t->sighand->siglock);
+	lockdep_assert_held(&t->sighand->siglock);
 
 	result = TRACE_SIGNAL_IGNORED;
 	if (!prepare_signal(sig, t, force))
-- 
cgit v1.2.3


From 6a2d90ba027adba528509ffa27097cffd3879257 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Apr 2022 09:23:55 -0500
Subject: ptrace: Reimplement PTRACE_KILL by always sending SIGKILL

The current implementation of PTRACE_KILL is buggy and has been for
many years as it assumes it's target has stopped in ptrace_stop.  At a
quick skim it looks like this assumption has existed since ptrace
support was added in linux v1.0.

While PTRACE_KILL has been deprecated we can not remove it as
a quick search with google code search reveals many existing
programs calling it.

When the ptracee is not stopped at ptrace_stop some fields would be
set that are ignored except in ptrace_stop.  Making the userspace
visible behavior of PTRACE_KILL a noop in those case.

As the usual rules are not obeyed it is not clear what the
consequences are of calling PTRACE_KILL on a running process.
Presumably userspace does not do this as it achieves nothing.

Replace the implementation of PTRACE_KILL with a simple
send_sig_info(SIGKILL) followed by a return 0.  This changes the
observable user space behavior only in that PTRACE_KILL on a process
not stopped in ptrace_stop will also kill it.  As that has always
been the intent of the code this seems like a reasonable change.

Cc: stable@vger.kernel.org
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-7-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/ptrace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index da30dcd477a0..7105821595bc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1236,9 +1236,8 @@ int ptrace_request(struct task_struct *child, long request,
 		return ptrace_resume(child, request, data);
 
 	case PTRACE_KILL:
-		if (child->exit_state)	/* already dead */
-			return 0;
-		return ptrace_resume(child, request, SIGKILL);
+		send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
+		return 0;
 
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	case PTRACE_GETREGSET:
-- 
cgit v1.2.3


From 7b0fe1367ef2d2591c20f03c4e64b7230e1ebcd7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 5 May 2022 12:25:36 -0500
Subject: ptrace: Document that wait_task_inactive can't fail

After ptrace_freeze_traced succeeds it is known that the tracee
has a __state value of __TASK_TRACED and that no __ptrace_unlink will
happen because the tracer is waiting for the tracee, and the tracee is
in ptrace_stop.

The function ptrace_freeze_traced can succeed at any point after
ptrace_stop has set TASK_TRACED and dropped siglock.  The read_lock on
tasklist_lock only excludes ptrace_attach.

This means that the !current->ptrace which executes under a read_lock
of tasklist_lock will never see a ptrace_freeze_trace as the tracer
must have gone away before the tasklist_lock was taken and
ptrace_attach can not occur until the read_lock is dropped.  As
ptrace_freeze_traced depends upon ptrace_attach running before it can
run that excludes ptrace_freeze_traced until __state is set to
TASK_RUNNING.  This means that task_is_traced will fail in
ptrace_freeze_attach and ptrace_freeze_attached will fail.

On the current->ptrace branch of ptrace_stop which will be reached any
time after ptrace_freeze_traced has succeed it is known that __state
is __TASK_TRACED and schedule() will be called with that state.

Use a WARN_ON_ONCE to document that wait_task_inactive(TASK_TRACED)
should never fail.  Remove the stale comment about may_ptrace_stop.

Strictly speaking this is not true because if PREEMPT_RT is enabled
wait_task_inactive can fail because __state can be changed.  I don't
see this as a problem as the ptrace code is currently broken on
PREMPT_RT, and this is one of the issues.  Failing and warning when
the assumptions of the code are broken is good.

Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-8-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/ptrace.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 7105821595bc..05953ac9f7bd 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,17 +266,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 	}
 	read_unlock(&tasklist_lock);
 
-	if (!ret && !ignore_state) {
-		if (!wait_task_inactive(child, __TASK_TRACED)) {
-			/*
-			 * This can only happen if may_ptrace_stop() fails and
-			 * ptrace_stop() changes ->state back to TASK_RUNNING,
-			 * so we should not worry about leaking __TASK_TRACED.
-			 */
-			WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
-			ret = -ESRCH;
-		}
-	}
+	if (!ret && !ignore_state &&
+	    WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED)))
+		ret = -ESRCH;
 
 	return ret;
 }
-- 
cgit v1.2.3


From 57b6de08b5f6586851c2261ef0cc16cd275615e7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 4 May 2022 13:39:58 -0500
Subject: ptrace: Admit ptrace_stop can generate spuriuos SIGTRAPs

Long ago and far away there was a BUG_ON at the start of ptrace_stop
that did "BUG_ON(!(current->ptrace & PT_PTRACED));" [1].  The BUG_ON
had never triggered but examination of the code showed that the BUG_ON
could actually trigger.  To complement removing the BUG_ON an attempt
to better handle the race was added.

The code detected the tracer had gone away and did not call
do_notify_parent_cldstop.  The code also attempted to prevent
ptrace_report_syscall from sending spurious SIGTRAPs when the tracer
went away.

The code to detect when the tracer had gone away before sending a
signal to tracer was a legitimate fix and continues to work to this
date.

The code to prevent sending spurious SIGTRAPs is a failure.  At the
time and until today the code only catches it when the tracer goes
away after siglock is dropped and before read_lock is acquired.  If
the tracer goes away after read_lock is dropped a spurious SIGTRAP can
still be sent to the tracee.  The tracer going away after read_lock
is dropped is the far likelier case as it is the bigger window.

Given that the attempt to prevent the generation of a SIGTRAP was a
failure and continues to be a failure remove the code that attempts to
do that.  This simplifies the code in ptrace_stop and makes
ptrace_stop much easier to reason about.

To successfully deal with the tracer going away, all of the tracer's
instrumentation of the child would need to be removed, and reliably
detecting when the tracer has set a signal to continue with would need
to be implemented.

[1] 66519f549ae5 ("[PATCH] fix ptracer death race yielding bogus BUG_ON")
History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-9-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/signal.c | 92 ++++++++++++++++++++++++---------------------------------
 1 file changed, 38 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 3fd2ce133387..d2d0c753156c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2187,13 +2187,12 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
  * with.  If the code did not stop because the tracer is gone,
  * the stop signal remains unchanged unless clear_code.
  */
-static int ptrace_stop(int exit_code, int why, int clear_code,
-			unsigned long message, kernel_siginfo_t *info)
+static int ptrace_stop(int exit_code, int why, unsigned long message,
+		       kernel_siginfo_t *info)
 	__releases(&current->sighand->siglock)
 	__acquires(&current->sighand->siglock)
 {
 	bool gstop_done = false;
-	bool read_code = true;
 
 	if (arch_ptrace_stop_needed()) {
 		/*
@@ -2212,7 +2211,14 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
 	/*
 	 * schedule() will not sleep if there is a pending signal that
 	 * can awaken the task.
+	 *
+	 * After this point ptrace_signal_wake_up will clear TASK_TRACED
+	 * if ptrace_unlink happens.  Handle previous ptrace_unlinks
+	 * here to prevent ptrace_stop sleeping in schedule.
 	 */
+	if (!current->ptrace)
+		return exit_code;
+
 	set_special_state(TASK_TRACED);
 
 	/*
@@ -2259,54 +2265,33 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
 
 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
-	if (likely(current->ptrace)) {
-		/*
-		 * Notify parents of the stop.
-		 *
-		 * While ptraced, there are two parents - the ptracer and
-		 * the real_parent of the group_leader.  The ptracer should
-		 * know about every stop while the real parent is only
-		 * interested in the completion of group stop.  The states
-		 * for the two don't interact with each other.  Notify
-		 * separately unless they're gonna be duplicates.
-		 */
+	/*
+	 * Notify parents of the stop.
+	 *
+	 * While ptraced, there are two parents - the ptracer and
+	 * the real_parent of the group_leader.  The ptracer should
+	 * know about every stop while the real parent is only
+	 * interested in the completion of group stop.  The states
+	 * for the two don't interact with each other.  Notify
+	 * separately unless they're gonna be duplicates.
+	 */
+	if (current->ptrace)
 		do_notify_parent_cldstop(current, true, why);
-		if (gstop_done && ptrace_reparented(current))
-			do_notify_parent_cldstop(current, false, why);
+	if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
+		do_notify_parent_cldstop(current, false, why);
 
-		/*
-		 * Don't want to allow preemption here, because
-		 * sys_ptrace() needs this task to be inactive.
-		 *
-		 * XXX: implement read_unlock_no_resched().
-		 */
-		preempt_disable();
-		read_unlock(&tasklist_lock);
-		cgroup_enter_frozen();
-		preempt_enable_no_resched();
-		freezable_schedule();
-		cgroup_leave_frozen(true);
-	} else {
-		/*
-		 * By the time we got the lock, our tracer went away.
-		 * Don't drop the lock yet, another tracer may come.
-		 *
-		 * If @gstop_done, the ptracer went away between group stop
-		 * completion and here.  During detach, it would have set
-		 * JOBCTL_STOP_PENDING on us and we'll re-enter
-		 * TASK_STOPPED in do_signal_stop() on return, so notifying
-		 * the real parent of the group stop completion is enough.
-		 */
-		if (gstop_done)
-			do_notify_parent_cldstop(current, false, why);
-
-		/* tasklist protects us from ptrace_freeze_traced() */
-		__set_current_state(TASK_RUNNING);
-		read_code = false;
-		if (clear_code)
-			exit_code = 0;
-		read_unlock(&tasklist_lock);
-	}
+	/*
+	 * Don't want to allow preemption here, because
+	 * sys_ptrace() needs this task to be inactive.
+	 *
+	 * XXX: implement read_unlock_no_resched().
+	 */
+	preempt_disable();
+	read_unlock(&tasklist_lock);
+	cgroup_enter_frozen();
+	preempt_enable_no_resched();
+	freezable_schedule();
+	cgroup_leave_frozen(true);
 
 	/*
 	 * We are back.  Now reacquire the siglock before touching
@@ -2314,8 +2299,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
 	 * any signal-sending on another CPU that wants to examine it.
 	 */
 	spin_lock_irq(&current->sighand->siglock);
-	if (read_code)
-		exit_code = current->exit_code;
+	exit_code = current->exit_code;
 	current->last_siginfo = NULL;
 	current->ptrace_message = 0;
 	current->exit_code = 0;
@@ -2343,7 +2327,7 @@ static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long mes
 	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
-	return ptrace_stop(exit_code, why, 1, message, &info);
+	return ptrace_stop(exit_code, why, message, &info);
 }
 
 int ptrace_notify(int exit_code, unsigned long message)
@@ -2515,7 +2499,7 @@ static void do_jobctl_trap(void)
 				 CLD_STOPPED, 0);
 	} else {
 		WARN_ON_ONCE(!signr);
-		ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
+		ptrace_stop(signr, CLD_STOPPED, 0, NULL);
 	}
 }
 
@@ -2568,7 +2552,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
 	 * comment in dequeue_signal().
 	 */
 	current->jobctl |= JOBCTL_STOP_DEQUEUED;
-	signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
+	signr = ptrace_stop(signr, CLD_TRAPPED, 0, info);
 
 	/* We're back.  Did the debugger cancel the sig?  */
 	if (signr == 0)
-- 
cgit v1.2.3


From 2500ad1c7fa42ad734677853961a3a8bec0772c5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Apr 2022 08:43:34 -0500
Subject: ptrace: Don't change __state

Stop playing with tsk->__state to remove TASK_WAKEKILL while a ptrace
command is executing.

Instead remove TASK_WAKEKILL from the definition of TASK_TRACED, and
implement a new jobctl flag TASK_PTRACE_FROZEN.  This new flag is set
in jobctl_freeze_task and cleared when ptrace_stop is awoken or in
jobctl_unfreeze_task (when ptrace_stop remains asleep).

In signal_wake_up add __TASK_TRACED to state along with TASK_WAKEKILL
when the wake up is for a fatal signal.  Skip adding __TASK_TRACED
when TASK_PTRACE_FROZEN is not set.  This has the same effect as
changing TASK_TRACED to __TASK_TRACED as all of the wake_ups that use
TASK_KILLABLE go through signal_wake_up.

Handle a ptrace_stop being called with a pending fatal signal.
Previously it would have been handled by schedule simply failing to
sleep.  As TASK_WAKEKILL is no longer part of TASK_TRACED schedule
will sleep with a fatal_signal_pending.   The code in signal_wake_up
guarantees that the code will be awaked by any fatal signal that
codes after TASK_TRACED is set.

Previously the __state value of __TASK_TRACED was changed to
TASK_RUNNING when woken up or back to TASK_TRACED when the code was
left in ptrace_stop.  Now when woken up ptrace_stop now clears
JOBCTL_PTRACE_FROZEN and when left sleeping ptrace_unfreezed_traced
clears JOBCTL_PTRACE_FROZEN.

Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-10-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/ptrace.c     | 21 ++++++++-------------
 kernel/sched/core.c |  5 +----
 kernel/signal.c     | 14 ++++++--------
 3 files changed, 15 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 05953ac9f7bd..83ed28262708 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -197,7 +197,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 	spin_lock_irq(&task->sighand->siglock);
 	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
 	    !__fatal_signal_pending(task)) {
-		WRITE_ONCE(task->__state, __TASK_TRACED);
+		task->jobctl |= JOBCTL_PTRACE_FROZEN;
 		ret = true;
 	}
 	spin_unlock_irq(&task->sighand->siglock);
@@ -207,23 +207,19 @@ static bool ptrace_freeze_traced(struct task_struct *task)
 
 static void ptrace_unfreeze_traced(struct task_struct *task)
 {
-	if (READ_ONCE(task->__state) != __TASK_TRACED)
-		return;
-
-	WARN_ON(!task->ptrace || task->parent != current);
+	unsigned long flags;
 
 	/*
-	 * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
-	 * Recheck state under the lock to close this race.
+	 * The child may be awake and may have cleared
+	 * JOBCTL_PTRACE_FROZEN (see ptrace_resume).  The child will
+	 * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
 	 */
-	spin_lock_irq(&task->sighand->siglock);
-	if (READ_ONCE(task->__state) == __TASK_TRACED) {
+	if (lock_task_sighand(task, &flags)) {
+		task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
 		if (__fatal_signal_pending(task))
 			wake_up_state(task, __TASK_TRACED);
-		else
-			WRITE_ONCE(task->__state, TASK_TRACED);
+		unlock_task_sighand(task, &flags);
 	}
-	spin_unlock_irq(&task->sighand->siglock);
 }
 
 /**
@@ -256,7 +252,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 	 */
 	read_lock(&tasklist_lock);
 	if (child->ptrace && child->parent == current) {
-		WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
 		/*
 		 * child->sighand can't be NULL, release_task()
 		 * does ptrace_unlink() before __exit_signal().
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..3c351707e830 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6304,10 +6304,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 
 	/*
 	 * We must load prev->state once (task_struct::state is volatile), such
-	 * that:
-	 *
-	 *  - we form a control dependency vs deactivate_task() below.
-	 *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
+	 * that we form a control dependency vs deactivate_task() below.
 	 */
 	prev_state = READ_ONCE(prev->__state);
 	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
diff --git a/kernel/signal.c b/kernel/signal.c
index d2d0c753156c..a58b68a2d3c6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2209,14 +2209,12 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 	}
 
 	/*
-	 * schedule() will not sleep if there is a pending signal that
-	 * can awaken the task.
-	 *
-	 * After this point ptrace_signal_wake_up will clear TASK_TRACED
-	 * if ptrace_unlink happens.  Handle previous ptrace_unlinks
-	 * here to prevent ptrace_stop sleeping in schedule.
+	 * After this point ptrace_signal_wake_up or signal_wake_up
+	 * will clear TASK_TRACED if ptrace_unlink happens or a fatal
+	 * signal comes in.  Handle previous ptrace_unlinks and fatal
+	 * signals here to prevent ptrace_stop sleeping in schedule.
 	 */
-	if (!current->ptrace)
+	if (!current->ptrace || __fatal_signal_pending(current))
 		return exit_code;
 
 	set_special_state(TASK_TRACED);
@@ -2305,7 +2303,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 	current->exit_code = 0;
 
 	/* LISTENING can be set only during STOP traps, clear it */
-	current->jobctl &= ~JOBCTL_LISTENING;
+	current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);
 
 	/*
 	 * Queued signals ignored us while we were stopped for tracing.
-- 
cgit v1.2.3


From 5b4197cb287daf3cfd008fbf8682a1d6f4b13c0b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Apr 2022 10:50:17 -0500
Subject: ptrace: Always take siglock in ptrace_resume

Make code analysis simpler and future changes easier by
always taking siglock in ptrace_resume.

Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-11-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/ptrace.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 83ed28262708..36a5b7a00d2f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -837,8 +837,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
 static int ptrace_resume(struct task_struct *child, long request,
 			 unsigned long data)
 {
-	bool need_siglock;
-
 	if (!valid_signal(data))
 		return -EIO;
 
@@ -874,18 +872,11 @@ static int ptrace_resume(struct task_struct *child, long request,
 	 * Note that we need siglock even if ->exit_code == data and/or this
 	 * status was not reported yet, the new status must not be cleared by
 	 * wait_task_stopped() after resume.
-	 *
-	 * If data == 0 we do not care if wait_task_stopped() reports the old
-	 * status and clears the code too; this can't race with the tracee, it
-	 * takes siglock after resume.
 	 */
-	need_siglock = data && !thread_group_empty(current);
-	if (need_siglock)
-		spin_lock_irq(&child->sighand->siglock);
+	spin_lock_irq(&child->sighand->siglock);
 	child->exit_code = data;
 	wake_up_state(child, __TASK_TRACED);
-	if (need_siglock)
-		spin_unlock_irq(&child->sighand->siglock);
+	spin_unlock_irq(&child->sighand->siglock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 31cae1eaae4fd65095ad6a3659db467bc3c2599e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 3 May 2022 15:57:47 -0500
Subject: sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state

Currently ptrace_stop() / do_signal_stop() rely on the special states
TASK_TRACED and TASK_STOPPED resp. to keep unique state. That is, this
state exists only in task->__state and nowhere else.

There's two spots of bother with this:

 - PREEMPT_RT has task->saved_state which complicates matters,
   meaning task_is_{traced,stopped}() needs to check an additional
   variable.

 - An alternative freezer implementation that itself relies on a
   special TASK state would loose TASK_TRACED/TASK_STOPPED and will
   result in misbehaviour.

As such, add additional state to task->jobctl to track this state
outside of task->__state.

NOTE: this doesn't actually fix anything yet, just adds extra state.

--EWB
  * didn't add a unnecessary newline in signal.h
  * Update t->jobctl in signal_wake_up and ptrace_signal_wake_up
    instead of in signal_wake_up_state.  This prevents the clearing
    of TASK_STOPPED and TASK_TRACED from getting lost.
  * Added warnings if JOBCTL_STOPPED or JOBCTL_TRACED are not cleared

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220421150654.757693825@infradead.org
Tested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20220505182645.497868-12-ebiederm@xmission.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/ptrace.c | 16 +++++++++++++---
 kernel/signal.c | 10 ++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 36a5b7a00d2f..328a34a99124 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
 	return true;
 }
 
-/* Ensure that nothing can wake it up, even SIGKILL */
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
 static bool ptrace_freeze_traced(struct task_struct *task)
 {
 	bool ret = false;
@@ -216,8 +221,10 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
 	 */
 	if (lock_task_sighand(task, &flags)) {
 		task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
-		if (__fatal_signal_pending(task))
+		if (__fatal_signal_pending(task)) {
+			task->jobctl &= ~TASK_TRACED;
 			wake_up_state(task, __TASK_TRACED);
+		}
 		unlock_task_sighand(task, &flags);
 	}
 }
@@ -462,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 	 * in and out of STOPPED are protected by siglock.
 	 */
 	if (task_is_stopped(task) &&
-	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+		task->jobctl &= ~JOBCTL_STOPPED;
 		signal_wake_up_state(task, __TASK_STOPPED);
+	}
 
 	spin_unlock(&task->sighand->siglock);
 
@@ -875,6 +884,7 @@ static int ptrace_resume(struct task_struct *child, long request,
 	 */
 	spin_lock_irq(&child->sighand->siglock);
 	child->exit_code = data;
+	child->jobctl &= ~JOBCTL_TRACED;
 	wake_up_state(child, __TASK_TRACED);
 	spin_unlock_irq(&child->sighand->siglock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index a58b68a2d3c6..e782c2611b64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -762,7 +762,10 @@ still_pending:
  */
 void signal_wake_up_state(struct task_struct *t, unsigned int state)
 {
+	lockdep_assert_held(&t->sighand->siglock);
+
 	set_tsk_thread_flag(t, TIF_SIGPENDING);
+
 	/*
 	 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
 	 * case. We don't check t->state here because there is a race with it
@@ -930,9 +933,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
 		for_each_thread(p, t) {
 			flush_sigqueue_mask(&flush, &t->pending);
 			task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
-			if (likely(!(t->ptrace & PT_SEIZED)))
+			if (likely(!(t->ptrace & PT_SEIZED))) {
+				t->jobctl &= ~JOBCTL_STOPPED;
 				wake_up_state(t, __TASK_STOPPED);
-			else
+			} else
 				ptrace_trap_notify(t);
 		}
 
@@ -2218,6 +2222,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 		return exit_code;
 
 	set_special_state(TASK_TRACED);
+	current->jobctl |= JOBCTL_TRACED;
 
 	/*
 	 * We're committing to trapping.  TRACED should be visible before
@@ -2436,6 +2441,7 @@ static bool do_signal_stop(int signr)
 		if (task_participate_group_stop(current))
 			notify = CLD_STOPPED;
 
+		current->jobctl |= JOBCTL_STOPPED;
 		set_special_state(TASK_STOPPED);
 		spin_unlock_irq(&current->sighand->siglock);
 
-- 
cgit v1.2.3


From 9c2136be0878c88c53dea26943ce40bb03ad8d8d Mon Sep 17 00:00:00 2001
From: Delyan Kratunov <delyank@fb.com>
Date: Wed, 11 May 2022 18:28:36 +0000
Subject: sched/tracing: Append prev_state to tp args instead

Commit fa2c3254d7cf (sched/tracing: Don't re-read p->state when emitting
sched_switch event, 2022-01-20) added a new prev_state argument to the
sched_switch tracepoint, before the prev task_struct pointer.

This reordering of arguments broke BPF programs that use the raw
tracepoint (e.g. tp_btf programs). The type of the second argument has
changed and existing programs that assume a task_struct* argument
(e.g. for bpf_task_storage access) will now fail to verify.

If we instead append the new argument to the end, all existing programs
would continue to work and can conditionally extract the prev_state
argument on supported kernel versions.

Fixes: fa2c3254d7cf (sched/tracing: Don't re-read p->state when emitting sched_switch event, 2022-01-20)
Signed-off-by: Delyan Kratunov <delyank@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/c8a6930dfdd58a4a5755fc01732675472979732b.camel@fb.com
---
 kernel/sched/core.c               | 2 +-
 kernel/trace/fgraph.c             | 4 ++--
 kernel/trace/ftrace.c             | 4 ++--
 kernel/trace/trace_events.c       | 8 ++++----
 kernel/trace/trace_osnoise.c      | 4 ++--
 kernel/trace/trace_sched_switch.c | 4 ++--
 kernel/trace/trace_sched_wakeup.c | 4 ++--
 7 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 51efaabac3e4..d58c0389eb23 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6382,7 +6382,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8f4fb328133a..a7e84c8543cb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -404,9 +404,9 @@ free:
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-				unsigned int prev_state,
 				struct task_struct *prev,
-				struct task_struct *next)
+				struct task_struct *next,
+				unsigned int prev_state)
 {
 	unsigned long long timestamp;
 	int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4f1d2f5e7263..af899b058c8d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-				     unsigned int prev_state,
 				     struct task_struct *prev,
-				     struct task_struct *next)
+				     struct task_struct *next,
+				     unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e11e167b7809..f97de82d1342 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-					unsigned int prev_state,
 					struct task_struct *prev,
-					struct task_struct *next)
+					struct task_struct *next,
+					unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-					 unsigned int prev_state,
 					 struct task_struct *prev,
-					 struct task_struct *next)
+					 struct task_struct *next,
+					 unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e9ae1f33a7f0..afb92e2f0aea 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1168,9 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  */
 static void
 trace_sched_switch_callback(void *data, bool preempt,
-			    unsigned int prev_state,
 			    struct task_struct *p,
-			    struct task_struct *n)
+			    struct task_struct *n,
+			    unsigned int prev_state)
 {
 	struct osnoise_variables *osn_var = this_cpu_osn_var();
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 45796d8bd4b2..c9ffdcfe622e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
-		   unsigned int prev_state,
-		   struct task_struct *prev, struct task_struct *next)
+		   struct task_struct *prev, struct task_struct *next,
+		   unsigned int prev_state)
 {
 	int flags;
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
-			  unsigned int prev_state,
-			  struct task_struct *prev, struct task_struct *next)
+			  struct task_struct *prev, struct task_struct *next,
+			  unsigned int prev_state)
 {
 	struct trace_array_cpu *data;
 	u64 T0, T1, delta;
-- 
cgit v1.2.3


From 07343110b293456d30393e89b86c4dee1ac051c8 Mon Sep 17 00:00:00 2001
From: Feng Zhou <zhoufeng.zf@bytedance.com>
Date: Wed, 11 May 2022 17:38:53 +0800
Subject: bpf: add bpf_map_lookup_percpu_elem for percpu map

Add new ebpf helpers bpf_map_lookup_percpu_elem.

The implementation method is relatively simple, refer to the implementation
method of map_lookup_elem of percpu map, increase the parameters of cpu, and
obtain it according to the specified cpu.

Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
Link: https://lore.kernel.org/r/20220511093854.411-2-zhoufeng.zf@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c    | 15 +++++++++++++++
 kernel/bpf/core.c        |  1 +
 kernel/bpf/hashtab.c     | 32 ++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c     | 18 ++++++++++++++++++
 kernel/bpf/verifier.c    | 17 +++++++++++++++--
 kernel/trace/bpf_trace.c |  2 ++
 6 files changed, 83 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 724613da6576..fe40d3b9458f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -243,6 +243,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
 	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
 }
 
+static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
+}
+
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -725,6 +739,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_lookup_elem = percpu_array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
+	.map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
 	.map_seq_show_elem = percpu_array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
 	.map_lookup_batch = generic_map_lookup_batch,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 13e9dbeeedf3..76f68d0a7ae8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2619,6 +2619,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
 const struct bpf_func_proto bpf_spin_lock_proto __weak;
 const struct bpf_func_proto bpf_spin_unlock_proto __weak;
 const struct bpf_func_proto bpf_jiffies64_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 705841279d16..17fb69c0e0dc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -2199,6 +2199,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 		return NULL;
 }
 
+static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+	struct htab_elem *l;
+
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+
+	l = __htab_map_lookup_elem(map, key);
+	if (l)
+		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+	else
+		return NULL;
+}
+
 static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
@@ -2211,6 +2225,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 	return NULL;
 }
 
+static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+	struct htab_elem *l;
+
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+
+	l = __htab_map_lookup_elem(map, key);
+	if (l) {
+		bpf_lru_node_set_ref(&l->lru_node);
+		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+	}
+
+	return NULL;
+}
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
 {
 	struct htab_elem *l;
@@ -2300,6 +2330,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
 	.map_update_elem = htab_percpu_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
+	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
@@ -2318,6 +2349,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
 	.map_update_elem = htab_lru_percpu_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
+	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
 	.map_set_for_each_callback_args = map_set_for_each_callback_args,
 	.map_for_each_callback = bpf_for_each_hash_elem,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3e709fed5306..d5f104a39092 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -119,6 +119,22 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
 };
 
+BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
+}
+
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
+	.func		= bpf_map_lookup_percpu_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_MAP_KEY,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 	.func		= bpf_user_rnd_u32,
 	.gpl_only	= false,
@@ -1420,6 +1436,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_map_pop_elem_proto;
 	case BPF_FUNC_map_peek_elem:
 		return &bpf_map_peek_elem_proto;
+	case BPF_FUNC_map_lookup_percpu_elem:
+		return &bpf_map_lookup_percpu_elem_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c27fee73a2cb..05c1b6656824 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6137,6 +6137,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
 			goto error;
 		break;
+	case BPF_FUNC_map_lookup_percpu_elem:
+		if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+		    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+		    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+			goto error;
+		break;
 	case BPF_FUNC_sk_storage_get:
 	case BPF_FUNC_sk_storage_delete:
 		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
@@ -6750,7 +6756,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	    func_id != BPF_FUNC_map_pop_elem &&
 	    func_id != BPF_FUNC_map_peek_elem &&
 	    func_id != BPF_FUNC_for_each_map_elem &&
-	    func_id != BPF_FUNC_redirect_map)
+	    func_id != BPF_FUNC_redirect_map &&
+	    func_id != BPF_FUNC_map_lookup_percpu_elem)
 		return 0;
 
 	if (map == NULL) {
@@ -13810,7 +13817,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		     insn->imm == BPF_FUNC_map_pop_elem    ||
 		     insn->imm == BPF_FUNC_map_peek_elem   ||
 		     insn->imm == BPF_FUNC_redirect_map    ||
-		     insn->imm == BPF_FUNC_for_each_map_elem)) {
+		     insn->imm == BPF_FUNC_for_each_map_elem ||
+		     insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
 			aux = &env->insn_aux_data[i + delta];
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
@@ -13859,6 +13867,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 					      bpf_callback_t callback_fn,
 					      void *callback_ctx,
 					      u64 flags))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+				     (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
 
 patch_map_ops_generic:
 			switch (insn->imm) {
@@ -13886,6 +13896,9 @@ patch_map_ops_generic:
 			case BPF_FUNC_for_each_map_elem:
 				insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
 				continue;
+			case BPF_FUNC_map_lookup_percpu_elem:
+				insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
+				continue;
 			}
 
 			goto patch_call_imm;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2eaac094caf8..7141ca8a1c2d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1197,6 +1197,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_map_pop_elem_proto;
 	case BPF_FUNC_map_peek_elem:
 		return &bpf_map_peek_elem_proto;
+	case BPF_FUNC_map_lookup_percpu_elem:
+		return &bpf_map_lookup_percpu_elem_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
 	case BPF_FUNC_ktime_get_boot_ns:
-- 
cgit v1.2.3


From a2aa95b71c9bbec793b5c5fa50f0a80d882b3e8d Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sat, 30 Apr 2022 21:08:03 +0800
Subject: bpf: Fix potential array overflow in bpf_trampoline_get_progs()

The cnt value in the 'cnt >= BPF_MAX_TRAMP_PROGS' check does not
include BPF_TRAMP_MODIFY_RETURN bpf programs, so the number of
the attached BPF_TRAMP_MODIFY_RETURN bpf programs in a trampoline
can exceed BPF_MAX_TRAMP_PROGS.

When this happens, the assignment '*progs++ = aux->prog' in
bpf_trampoline_get_progs() will cause progs array overflow as the
progs field in the bpf_tramp_progs struct can only hold at most
BPF_MAX_TRAMP_PROGS bpf programs.

Fixes: 88fd9e5352fe ("bpf: Refactor trampoline update code")
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Link: https://lore.kernel.org/r/20220430130803.210624-1-ytcoode@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/trampoline.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 0e9b3aefc34a..93c7675f0c9e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -415,7 +415,7 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
 	enum bpf_tramp_prog_type kind;
 	struct bpf_tramp_link *link_exiting;
 	int err = 0;
-	int cnt;
+	int cnt = 0, i;
 
 	kind = bpf_attach_type_to_tramp(link->link.prog);
 	mutex_lock(&tr->mutex);
@@ -426,7 +426,10 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
 		err = -EBUSY;
 		goto out;
 	}
-	cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
+
+	for (i = 0; i < BPF_TRAMP_MAX; i++)
+		cnt += tr->progs_cnt[i];
+
 	if (kind == BPF_TRAMP_REPLACE) {
 		/* Cannot attach extension if fentry/fexit are in use. */
 		if (cnt) {
@@ -512,16 +515,19 @@ out:
 
 void bpf_trampoline_put(struct bpf_trampoline *tr)
 {
+	int i;
+
 	if (!tr)
 		return;
 	mutex_lock(&trampoline_mutex);
 	if (!refcount_dec_and_test(&tr->refcnt))
 		goto out;
 	WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
-	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
-		goto out;
-	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
-		goto out;
+
+	for (i = 0; i < BPF_TRAMP_MAX; i++)
+		if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
+			goto out;
+
 	/* This code will be executed even when the last bpf_tramp_image
 	 * is alive. All progs are detached from the trampoline and the
 	 * trampoline image is patched with jmp into epilogue to skip
-- 
cgit v1.2.3


From 2760f5a415c3b86c6394738c6cff740c8b4ce664 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 6 May 2022 15:54:01 -0700
Subject: stop_machine: Add stop_core_cpuslocked() for per-core operations

Hardware core level testing features require near simultaneous execution
of WRMSR instructions on all threads of a core to initiate a test.

Provide a customized cut down version of stop_machine_cpuslocked() that
just operates on the threads of a single core.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220506225410.1652287-4-tony.luck@intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 kernel/stop_machine.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index cbc30271ea4d..0887b3af609e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -633,6 +633,27 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 }
 EXPORT_SYMBOL_GPL(stop_machine);
 
+#ifdef CONFIG_SCHED_SMT
+int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data)
+{
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+	struct multi_stop_data msdata = {
+		.fn = fn,
+		.data = data,
+		.num_threads = cpumask_weight(smt_mask),
+		.active_cpus = smt_mask,
+	};
+
+	lockdep_assert_cpus_held();
+
+	/* Set the initial state and stop all online cpus. */
+	set_state(&msdata, MULTI_STOP_PREPARE);
+	return stop_cpus(smt_mask, multi_cpu_stop, &msdata);
+}
+EXPORT_SYMBOL_GPL(stop_core_cpuslocked);
+#endif
+
 /**
  * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
  * @fn: the function to run
-- 
cgit v1.2.3


From c14e522bc76efed6e947cd0ab83a1fac7a7a3ec9 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Mon, 2 May 2022 21:51:03 +0100
Subject: module: Make module_flags_taint() accept a module's taints bitmap and
 usable outside core code

No functional change.

The purpose of this patch is to modify module_flags_taint() to accept
a module's taints bitmap as a parameter and modifies all users
accordingly. Furthermore, it is now possible to access a given
module's taint flags data outside of non-essential code yet does
remain for internal use only.

This is in preparation for module unload taint tracking support.

Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h | 1 +
 kernel/module/main.c     | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 3e23bef5884d..abbd1c5ef264 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -100,6 +100,7 @@ int cmp_name(const void *name, const void *sym);
 long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
 		       unsigned int section);
 char *module_flags(struct module *mod, char *buf);
+size_t module_flags_taint(unsigned long taints, char *buf);
 
 static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
 {
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 05a42d8fcd7a..7dbdd098b995 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -890,13 +890,13 @@ static inline int module_unload_init(struct module *mod)
 }
 #endif /* CONFIG_MODULE_UNLOAD */
 
-static size_t module_flags_taint(struct module *mod, char *buf)
+size_t module_flags_taint(unsigned long taints, char *buf)
 {
 	size_t l = 0;
 	int i;
 
 	for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
-		if (taint_flags[i].module && test_bit(i, &mod->taints))
+		if (taint_flags[i].module && test_bit(i, &taints))
 			buf[l++] = taint_flags[i].c_true;
 	}
 
@@ -974,7 +974,7 @@ static ssize_t show_taint(struct module_attribute *mattr,
 {
 	size_t l;
 
-	l = module_flags_taint(mk->mod, buffer);
+	l = module_flags_taint(mk->mod->taints, buffer);
 	buffer[l++] = '\n';
 	return l;
 }
@@ -2993,7 +2993,7 @@ char *module_flags(struct module *mod, char *buf)
 	    mod->state == MODULE_STATE_GOING ||
 	    mod->state == MODULE_STATE_COMING) {
 		buf[bx++] = '(';
-		bx += module_flags_taint(mod, buf + bx);
+		bx += module_flags_taint(mod->taints, buf + bx);
 		/* Show a - for module-is-being-unloaded */
 		if (mod->state == MODULE_STATE_GOING)
 			buf[bx++] = '-';
-- 
cgit v1.2.3


From 6fb0538d0121ffab770a505b183968d93466ad59 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Mon, 2 May 2022 21:51:04 +0100
Subject: module: Move module_assert_mutex_or_preempt() to internal.h

No functional change.

This patch migrates module_assert_mutex_or_preempt() to internal.h.
So, the aforementiond function can be used outside of main/or core
module code yet will remain restricted for internal use only.

Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/internal.h | 12 ++++++++++++
 kernel/module/main.c     | 11 -----------
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index abbd1c5ef264..0bdf64c9dfb5 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/rculist.h>
+#include <linux/rcupdate.h>
 
 #ifndef ARCH_SHF_SMALL
 #define ARCH_SHF_SMALL 0
@@ -102,6 +103,17 @@ long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
 char *module_flags(struct module *mod, char *buf);
 size_t module_flags_taint(unsigned long taints, char *buf);
 
+static inline void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+	if (unlikely(!debug_locks))
+		return;
+
+	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
+		     !lockdep_is_held(&module_mutex));
+#endif
+}
+
 static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
 {
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7dbdd098b995..7a0484900320 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -118,17 +118,6 @@ static void mod_update_bounds(struct module *mod)
 #endif
 }
 
-static void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
-	if (unlikely(!debug_locks))
-		return;
-
-	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
-		!lockdep_is_held(&module_mutex));
-#endif
-}
-
 /* Block module loading/unloading? */
 int modules_disabled = 0;
 core_param(nomodule, modules_disabled, bint, 0);
-- 
cgit v1.2.3


From 99bd9956551b27cb6f5b445abaced7e13b9976cd Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Mon, 2 May 2022 21:52:52 +0100
Subject: module: Introduce module unload taint tracking

Currently, only the initial module that tainted the kernel is
recorded e.g. when an out-of-tree module is loaded.

The purpose of this patch is to allow the kernel to maintain a record of
each unloaded module that taints the kernel. So, in addition to
displaying a list of linked modules (see print_modules()) e.g. in the
event of a detected bad page, unloaded modules that carried a taint/or
taints are displayed too. A tainted module unload count is maintained.

The number of tracked modules is not fixed. This feature is disabled by
default.

Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/Makefile   |  1 +
 kernel/module/internal.h | 21 +++++++++++++++++
 kernel/module/main.c     |  5 ++++
 kernel/module/tracking.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+)
 create mode 100644 kernel/module/tracking.c

(limited to 'kernel')

diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index d1ca799c12e2..948efea81e85 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -18,3 +18,4 @@ obj-$(CONFIG_PROC_FS) += procfs.o
 obj-$(CONFIG_SYSFS) += sysfs.o
 obj-$(CONFIG_KGDB_KDB) += kdb.o
 obj-$(CONFIG_MODVERSIONS) += version.o
+obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 0bdf64c9dfb5..bc5507ab8450 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -145,6 +145,27 @@ static inline bool set_livepatch_module(struct module *mod)
 #endif
 }
 
+#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
+struct mod_unload_taint {
+	struct list_head list;
+	char name[MODULE_NAME_LEN];
+	unsigned long taints;
+	u64 count;
+};
+
+int try_add_tainted_module(struct module *mod);
+void print_unloaded_tainted_modules(void);
+#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+static inline int try_add_tainted_module(struct module *mod)
+{
+	return 0;
+}
+
+static inline void print_unloaded_tainted_modules(void)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+
 #ifdef CONFIG_MODULE_DECOMPRESS
 int module_decompress(struct load_info *info, const void *buf, size_t size);
 void module_decompress_cleanup(struct load_info *info);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 7a0484900320..6c3b4a846645 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1190,6 +1190,9 @@ static void free_module(struct module *mod)
 	module_bug_cleanup(mod);
 	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
 	synchronize_rcu();
+	if (try_add_tainted_module(mod))
+		pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
+		       mod->name);
 	mutex_unlock(&module_mutex);
 
 	/* Clean up CFI for the module. */
@@ -3125,6 +3128,8 @@ void print_modules(void)
 			continue;
 		pr_cont(" %s%s", mod->name, module_flags(mod, buf));
 	}
+
+	print_unloaded_tainted_modules();
 	preempt_enable();
 	if (last_unloaded_module[0])
 		pr_cont(" [last unloaded: %s]", last_unloaded_module);
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
new file mode 100644
index 000000000000..7f8133044d09
--- /dev/null
+++ b/kernel/module/tracking.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module taint unload tracking support
+ *
+ * Copyright (C) 2022 Aaron Tomlin
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include "internal.h"
+
+static LIST_HEAD(unloaded_tainted_modules);
+
+int try_add_tainted_module(struct module *mod)
+{
+	struct mod_unload_taint *mod_taint;
+
+	module_assert_mutex_or_preempt();
+
+	list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
+				lockdep_is_held(&module_mutex)) {
+		if (!strcmp(mod_taint->name, mod->name) &&
+		    mod_taint->taints & mod->taints) {
+			mod_taint->count++;
+			goto out;
+		}
+	}
+
+	mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL);
+	if (unlikely(!mod_taint))
+		return -ENOMEM;
+	strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN);
+	mod_taint->taints = mod->taints;
+	list_add_rcu(&mod_taint->list, &unloaded_tainted_modules);
+	mod_taint->count = 1;
+out:
+	return 0;
+}
+
+void print_unloaded_tainted_modules(void)
+{
+	struct mod_unload_taint *mod_taint;
+	char buf[MODULE_FLAGS_BUF_SIZE];
+
+	if (!list_empty(&unloaded_tainted_modules)) {
+		printk(KERN_DEFAULT "Unloaded tainted modules:");
+		list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules,
+					list) {
+			size_t l;
+
+			l = module_flags_taint(mod_taint->taints, buf);
+			buf[l++] = '\0';
+			pr_cont(" %s(%s):%llu", mod_taint->name, buf,
+				mod_taint->count);
+		}
+	}
+}
-- 
cgit v1.2.3


From 391e982bfa632b8315235d8be9c0a81374c6a19c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 4 May 2022 12:54:20 +0300
Subject: module: fix [e_shstrndx].sh_size=0 OOB access

It is trivial to craft a module to trigger OOB access in this line:

	if (info->secstrings[strhdr->sh_size - 1] != '\0') {

BUG: unable to handle page fault for address: ffffc90000aa0fff
PGD 100000067 P4D 100000067 PUD 100066067 PMD 10436f067 PTE 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 7 PID: 1215 Comm: insmod Not tainted 5.18.0-rc5-00007-g9bf578647087-dirty #10
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014
RIP: 0010:load_module+0x19b/0x2391

Fixes: ec2a29593c83 ("module: harden ELF info handling")
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
[rebased patch onto modules-next]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 6c3b4a846645..23432fabfde8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1726,6 +1726,10 @@ static int elf_validity_check(struct load_info *info)
 	 * strings in the section safe.
 	 */
 	info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+	if (strhdr->sh_size == 0) {
+		pr_err("empty section name table\n");
+		goto no_exec;
+	}
 	if (info->secstrings[strhdr->sh_size - 1] != '\0') {
 		pr_err("ELF Spec violation: section name table isn't null terminated\n");
 		goto no_exec;
-- 
cgit v1.2.3


From 8eac910a49347821cbafc770a319e00ccd69d58b Mon Sep 17 00:00:00 2001
From: Lecopzer Chen <lecopzer.chen@mediatek.com>
Date: Wed, 27 Apr 2022 15:36:06 +0800
Subject: module: show disallowed symbol name for inherit_taint()

The error log for inherit_taint() doesn't really help to find the
symbol which violates GPL rules.

For example,
if a module has 300 symbol and includes 50 disallowed symbols,
the log only shows the content below and we have no idea what symbol is.
    AAA: module using GPL-only symbols uses symbols from proprietary module BBB.

It's hard for user who doesn't really know how the symbol was parsing.

This patch add symbol name to tell the offending symbols explicitly.
    AAA: module using GPL-only symbols uses symbols SSS from proprietary module BBB.

Signed-off-by: Lecopzer Chen <lecopzer.chen@mediatek.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 23432fabfde8..ac0a7882899b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1038,20 +1038,20 @@ static int verify_namespace_is_imported(const struct load_info *info,
 	return 0;
 }
 
-static bool inherit_taint(struct module *mod, struct module *owner)
+static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
 {
 	if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
 		return true;
 
 	if (mod->using_gplonly_symbols) {
-		pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
-			mod->name, owner->name);
+		pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
+			mod->name, name, owner->name);
 		return false;
 	}
 
 	if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
-		pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
-			mod->name, owner->name);
+		pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
+			mod->name, name, owner->name);
 		set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
 	}
 	return true;
@@ -1083,7 +1083,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
 	if (fsa.license == GPL_ONLY)
 		mod->using_gplonly_symbols = true;
 
-	if (!inherit_taint(mod, fsa.owner)) {
+	if (!inherit_taint(mod, fsa.owner, name)) {
 		fsa.sym = NULL;
 		goto getname;
 	}
-- 
cgit v1.2.3


From c6eee9df57a6d9252bae93a9386d0d872798f5d5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 5 May 2022 12:52:10 +0900
Subject: module: do not pass opaque pointer for symbol search

There is no need to use an opaque pointer for check_exported_symbol()
or find_exported_symbol_in_section.

Pass (struct find_symbol_arg *) explicitly.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index ac0a7882899b..d00a8de6d8f5 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -244,11 +244,9 @@ static __maybe_unused void *any_section_objs(const struct load_info *info,
 #endif
 
 static bool check_exported_symbol(const struct symsearch *syms,
-				  struct module *owner,
-				  unsigned int symnum, void *data)
+				  struct module *owner, unsigned int symnum,
+				  struct find_symbol_arg *fsa)
 {
-	struct find_symbol_arg *fsa = data;
-
 	if (!fsa->gplok && syms->license == GPL_ONLY)
 		return false;
 	fsa->owner = owner;
@@ -285,16 +283,15 @@ int cmp_name(const void *name, const void *sym)
 
 static bool find_exported_symbol_in_section(const struct symsearch *syms,
 					    struct module *owner,
-					    void *data)
+					    struct find_symbol_arg *fsa)
 {
-	struct find_symbol_arg *fsa = data;
 	struct kernel_symbol *sym;
 
 	sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
 			sizeof(struct kernel_symbol), cmp_name);
 
 	if (sym != NULL && check_exported_symbol(syms, owner,
-						 sym - syms->start, data))
+						 sym - syms->start, fsa))
 		return true;
 
 	return false;
-- 
cgit v1.2.3


From cdd66eb52fdaa9bdab7f1be8dc9162bf4acc64ae Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 5 May 2022 12:52:11 +0900
Subject: module: do not binary-search in __ksymtab_gpl if fsa->gplok is false

Currently, !fsa->gplok && syms->license == GPL_ONLY) is checked after
bsearch() succeeds.

It is meaningless to do the binary search in the GPL symbol table when
fsa->gplok is false because we know find_exported_symbol_in_section()
will fail anyway.

This check should be done before bsearch().

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index d00a8de6d8f5..8a068fff437c 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -247,8 +247,6 @@ static bool check_exported_symbol(const struct symsearch *syms,
 				  struct module *owner, unsigned int symnum,
 				  struct find_symbol_arg *fsa)
 {
-	if (!fsa->gplok && syms->license == GPL_ONLY)
-		return false;
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
 	fsa->sym = &syms->start[symnum];
@@ -287,6 +285,9 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
 {
 	struct kernel_symbol *sym;
 
+	if (!fsa->gplok && syms->license == GPL_ONLY)
+		return false;
+
 	sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
 			sizeof(struct kernel_symbol), cmp_name);
 
-- 
cgit v1.2.3


From 7390b94a3c2d93272d6da4945b81a9cf78055b7b Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 5 May 2022 12:52:12 +0900
Subject: module: merge check_exported_symbol() into
 find_exported_symbol_in_section()

Now check_exported_symbol() always succeeds.

Merge it into find_exported_symbol_in_search() to make the code concise.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/main.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 8a068fff437c..fed58d30725d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -243,17 +243,6 @@ static __maybe_unused void *any_section_objs(const struct load_info *info,
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
-static bool check_exported_symbol(const struct symsearch *syms,
-				  struct module *owner, unsigned int symnum,
-				  struct find_symbol_arg *fsa)
-{
-	fsa->owner = owner;
-	fsa->crc = symversion(syms->crcs, symnum);
-	fsa->sym = &syms->start[symnum];
-	fsa->license = syms->license;
-	return true;
-}
-
 static const char *kernel_symbol_name(const struct kernel_symbol *sym)
 {
 #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
@@ -290,12 +279,15 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
 
 	sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
 			sizeof(struct kernel_symbol), cmp_name);
+	if (!sym)
+		return false;
 
-	if (sym != NULL && check_exported_symbol(syms, owner,
-						 sym - syms->start, fsa))
-		return true;
+	fsa->owner = owner;
+	fsa->crc = symversion(syms->crcs, sym - syms->start);
+	fsa->sym = sym;
+	fsa->license = syms->license;
 
-	return false;
+	return true;
 }
 
 /*
-- 
cgit v1.2.3


From a7bd57b87f65e0e1c5d41baf51a0d0b49fb30808 Mon Sep 17 00:00:00 2001
From: lizhe <sensor1010@163.com>
Date: Thu, 12 May 2022 20:38:36 -0700
Subject: kernel/crash_core.c: remove redundant check of ck_cmdline

At the end of get_last_crashkernel(), the judgement of ck_cmdline is
obviously unnecessary and causes redundance, let's clean it up.

Link: https://lkml.kernel.org/r/20220506104116.259323-1-sensor1010@163.com
Signed-off-by: lizhe <sensor1010@163.com>
Acked-by: Baoquan He <bhe@redhat.com>
Acked-by: Philipp Rudo <prudo@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 256cf6db573c..c232f01a2c54 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -222,9 +222,6 @@ next:
 		p = strstr(p+1, name);
 	}
 
-	if (!ck_cmdline)
-		return NULL;
-
 	return ck_cmdline;
 }
 
-- 
cgit v1.2.3


From 47b7eae62aa7dc69f0e6d12493e5468ba57bf074 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Thu, 12 May 2022 20:38:37 -0700
Subject: relay: remove redundant assignment to pointer buf

Pointer buf is being assigned a value that is not being read, buf is being
re-assigned in the next starement.  The assignment is redundant and can be
removed.

Cleans up clang scan build warning:
kernel/relay.c:443:8: warning: Although the value stored to 'buf' is
used in the enclosing expression, the value is never actually read
from 'buf' [deadcode.DeadStores]

Link: https://lkml.kernel.org/r/20220508212152.58753-1-colin.i.king@gmail.com
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Kalle Valo <kvalo@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/relay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index d1a67fbb819d..6a611e779e95 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu)
 
 	mutex_lock(&relay_channels_mutex);
 	list_for_each_entry(chan, &relay_channels, list) {
-		if ((buf = *per_cpu_ptr(chan->buf, cpu)))
+		if (*per_cpu_ptr(chan->buf, cpu))
 			continue;
 		buf = relay_open_buf(chan, cpu);
 		if (!buf) {
-- 
cgit v1.2.3


From 6829061315065c7af394d556a887fbf847e4e708 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 11 May 2022 17:29:22 +0200
Subject: futex: Remove a PREEMPT_RT_FULL reference.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Earlier the PREEMPT_RT patch had a PREEMPT_RT_FULL and PREEMPT_RT_BASE
Kconfig option. The latter was a subset of the functionality that was
enabled with PREEMPT_RT_FULL and was mainly useful for debugging.

During the merging efforts the two Kconfig options were abandoned in the
v5.4.3-rt1 release and since then there is only PREEMPT_RT which enables
the full features set (as PREEMPT_RT_FULL did in earlier releases).

Replace the PREEMPT_RT_FULL reference with PREEMPT_RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/YnvWUvq1vpqCfCU7@linutronix.de
---
 kernel/futex/pi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 183b28c32c83..ce2889f12375 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1005,7 +1005,7 @@ retry_private:
 	rt_mutex_init_waiter(&rt_waiter);
 
 	/*
-	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
 	 * hold it while doing rt_mutex_start_proxy(), because then it will
 	 * include hb->lock in the blocking chain, even through we'll not in
 	 * fact hold it while blocking. This will lead it to report -EDEADLK
-- 
cgit v1.2.3


From 1521c607cabe7c7edb028e211e88ba1e0f19714e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 May 2022 08:44:29 +0200
Subject: swiotlb: don't panic when the swiotlb buffer can't be allocated

For historical reasons the switlb code paniced when the metadata could
not be allocated, but just printed a warning when the actual main
swiotlb buffer could not be allocated.  Restore this somewhat unexpected
behavior as changing it caused a boot failure on the Microchip RISC-V
PolarFire SoC Icicle kit.

Fixes: 6424e31b1c05 ("swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl")
Reported-by: Conor Dooley <Conor.Dooley@microchip.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Tested-by: Conor Dooley <Conor.Dooley@microchip.com>
---
 kernel/dma/swiotlb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e2ef0864eb1e..3e992a308c8a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -254,8 +254,10 @@ retry:
 		tlb = memblock_alloc(bytes, PAGE_SIZE);
 	else
 		tlb = memblock_alloc_low(bytes, PAGE_SIZE);
-	if (!tlb)
-		panic("%s: failed to allocate tlb structure\n", __func__);
+	if (!tlb) {
+		pr_warn("%s: failed to allocate tlb structure\n", __func__);
+		return;
+	}
 
 	if (remap && remap(tlb, nslabs) < 0) {
 		memblock_free(tlb, PAGE_ALIGN(bytes));
-- 
cgit v1.2.3


From a5e891321a219679d5a2828150a7dda29a47d8a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 May 2022 08:13:57 +0200
Subject: swiotlb: use the right nslabs value in swiotlb_init_remap

default_nslabs should only be used to initialize nslabs, after that we
need to use the local variable that can shrink when allocations or the
remap don't succeed.

Fixes: 6424e31b1c05 ("swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 kernel/dma/swiotlb.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 3e992a308c8a..113e1e8aaca3 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -234,7 +234,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = default_nslabs;
-	size_t alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+	size_t alloc_size;
 	size_t bytes;
 	void *tlb;
 
@@ -249,7 +249,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 	 * memory encryption.
 	 */
 retry:
-	bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+	bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
 	if (flags & SWIOTLB_ANY)
 		tlb = memblock_alloc(bytes, PAGE_SIZE);
 	else
@@ -269,12 +269,13 @@ retry:
 		goto retry;
 	}
 
+	alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
 	mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
 	if (!mem->slots)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	swiotlb_init_io_tlb_mem(mem, __pa(tlb), default_nslabs, false);
+	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
 	mem->force_bounce = flags & SWIOTLB_FORCE;
 
 	if (flags & SWIOTLB_VERBOSE)
-- 
cgit v1.2.3


From 1b8e5d1a53696d92374acce2b19a649427f1ec1e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 May 2022 08:24:10 +0200
Subject: swiotlb: use the right nslabs-derived sizes in swiotlb_init_late

nslabs can shrink when allocations or the remap don't succeed, so make
sure to use it for all sizing.  For that remove the bytes value that
can get stale and replace it with local calculations and a boolean to
indicate if the originally requested size could not be allocated.

Fixes: 6424e31b1c05 ("swiotlb: remove swiotlb_init_with_tbl and swiotlb_init_late_with_tbl")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 kernel/dma/swiotlb.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 113e1e8aaca3..d6e62a6a42ce 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -297,9 +297,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
-	unsigned long bytes;
 	unsigned char *vstart = NULL;
 	unsigned int order;
+	bool retried = false;
 	int rc = 0;
 
 	if (swiotlb_force_disable)
@@ -308,7 +308,6 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 retry:
 	order = get_order(nslabs << IO_TLB_SHIFT);
 	nslabs = SLABS_PER_PAGE << order;
-	bytes = nslabs << IO_TLB_SHIFT;
 
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
 		vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
@@ -316,16 +315,13 @@ retry:
 		if (vstart)
 			break;
 		order--;
+		nslabs = SLABS_PER_PAGE << order;
+		retried = true;
 	}
 
 	if (!vstart)
 		return -ENOMEM;
 
-	if (order != get_order(bytes)) {
-		pr_warn("only able to allocate %ld MB\n",
-			(PAGE_SIZE << order) >> 20);
-		nslabs = SLABS_PER_PAGE << order;
-	}
 	if (remap)
 		rc = remap(vstart, nslabs);
 	if (rc) {
@@ -334,9 +330,15 @@ retry:
 		nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
 		if (nslabs < IO_TLB_MIN_SLABS)
 			return rc;
+		retried = true;
 		goto retry;
 	}
 
+	if (retried) {
+		pr_warn("only able to allocate %ld MB\n",
+			(PAGE_SIZE << order) >> 20);
+	}
+
 	mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 		get_order(array_size(sizeof(*mem->slots), nslabs)));
 	if (!mem->slots) {
@@ -344,7 +346,8 @@ retry:
 		return -ENOMEM;
 	}
 
-	set_memory_decrypted((unsigned long)vstart, bytes >> PAGE_SHIFT);
+	set_memory_decrypted((unsigned long)vstart,
+			     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
 	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true);
 
 	swiotlb_print_info();
-- 
cgit v1.2.3


From 4b6313cf99b0d51b49aeaea98ec76ca8161ecb80 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 12 May 2022 18:10:24 -0700
Subject: bpf: Fix combination of jit blinding and pointers to bpf subprogs.

The combination of jit blinding and pointers to bpf subprogs causes:
[   36.989548] BUG: unable to handle page fault for address: 0000000100000001
[   36.990342] #PF: supervisor instruction fetch in kernel mode
[   36.990968] #PF: error_code(0x0010) - not-present page
[   36.994859] RIP: 0010:0x100000001
[   36.995209] Code: Unable to access opcode bytes at RIP 0xffffffd7.
[   37.004091] Call Trace:
[   37.004351]  <TASK>
[   37.004576]  ? bpf_loop+0x4d/0x70
[   37.004932]  ? bpf_prog_3899083f75e4c5de_F+0xe3/0x13b

The jit blinding logic didn't recognize that ld_imm64 with an address
of bpf subprogram is a special instruction and proceeded to randomize it.
By itself it wouldn't have been an issue, but jit_subprogs() logic
relies on two step process to JIT all subprogs and then JIT them
again when addresses of all subprogs are known.
Blinding process in the first JIT phase caused second JIT to miss
adjustment of special ld_imm64.

Fix this issue by ignoring special ld_imm64 instructions that don't have
user controlled constants and shouldn't be blinded.

Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220513011025.13344-1-alexei.starovoitov@gmail.com
---
 kernel/bpf/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 76f68d0a7ae8..9cc91f0f3115 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1434,6 +1434,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 	insn = clone->insnsi;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			/* ld_imm64 with an address of bpf subprog is not
+			 * a user controlled constant. Don't randomize it,
+			 * since it will conflict with jit_subprogs() logic.
+			 */
+			insn++;
+			i++;
+			continue;
+		}
+
 		/* We temporarily need to hold the original ld64 insn
 		 * so that we can still access the first part in the
 		 * second blinding run.
-- 
cgit v1.2.3


From 534aa1dc975ac883ad89110534585a96630802a0 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Mon, 9 May 2022 18:20:53 -0700
Subject: printk: stop including cache.h from printk.h

An inclusion of cache.h in printk.h was added in 2014 in commit
c28aa1f0a847 ("printk/cache: mark printk_once test variable
__read_mostly") in order to bring in the definition of __read_mostly.  The
usage of __read_mostly was later removed in commit 3ec25826ae33 ("printk:
Tie printk_once / printk_deferred_once into .data.once for reset") which
made the inclusion of cache.h unnecessary, so remove it.

We have a small amount of code that depended on the inclusion of cache.h
from printk.h; fix that code to include the appropriate header.

This fixes a circular inclusion on arm64 (linux/printk.h -> linux/cache.h
-> asm/cache.h -> linux/kasan-enabled.h -> linux/static_key.h ->
linux/jump_label.h -> linux/bug.h -> asm/bug.h -> linux/printk.h) that
would otherwise be introduced by the next patch.

Build tested using {allyesconfig,defconfig} x {arm64,x86_64}.

Link: https://linux-review.googlesource.com/id/I8fd51f72c9ef1f2d6afd3b2cbc875aa4792c1fba
Link: https://lkml.kernel.org/r/20220427195820.1716975-1-pcc@google.com
Signed-off-by: Peter Collingbourne <pcc@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/bpf/bpf_lru_list.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 6b12f06ee18c..4ea227c9c1ad 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -4,6 +4,7 @@
 #ifndef __BPF_LRU_LIST_H_
 #define __BPF_LRU_LIST_H_
 
+#include <linux/cache.h>
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
 
-- 
cgit v1.2.3


From 1366992e16bddd5e2d9a561687f367f9f802e2e4 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 10 Apr 2022 16:49:50 +0200
Subject: timekeeping: Add raw clock fallback for random_get_entropy()

The addition of random_get_entropy_fallback() provides access to
whichever time source has the highest frequency, which is useful for
gathering entropy on platforms without available cycle counters. It's
not necessarily as good as being able to quickly access a cycle counter
that the CPU has, but it's still something, even when it falls back to
being jiffies-based.

In the event that a given arch does not define get_cycles(), falling
back to the get_cycles() default implementation that returns 0 is really
not the best we can do. Instead, at least calling
random_get_entropy_fallback() would be preferable, because that always
needs to return _something_, even falling back to jiffies eventually.
It's not as though random_get_entropy_fallback() is super high precision
or guaranteed to be entropic, but basically anything that's not zero all
the time is better than returning zero all the time.

Finally, since random_get_entropy_fallback() is used during extremely
early boot when randomizing freelists in mm_init(), it can be called
before timekeeping has been initialized. In that case there really is
nothing we can do; jiffies hasn't even started ticking yet. So just give
up and return 0.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Theodore Ts'o <tytso@mit.edu>
---
 kernel/time/timekeeping.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b1398fbddaf..871c912860ed 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -17,6 +17,7 @@
 #include <linux/clocksource.h>
 #include <linux/jiffies.h>
 #include <linux/time.h>
+#include <linux/timex.h>
 #include <linux/tick.h>
 #include <linux/stop_machine.h>
 #include <linux/pvclock_gtod.h>
@@ -2380,6 +2381,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
 	return 0;
 }
 
+/**
+ * random_get_entropy_fallback - Returns the raw clock source value,
+ * used by random.c for platforms with no valid random_get_entropy().
+ */
+unsigned long random_get_entropy_fallback(void)
+{
+	struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+	struct clocksource *clock = READ_ONCE(tkr->clock);
+
+	if (unlikely(timekeeping_suspended || !clock))
+		return 0;
+	return clock->read(clock);
+}
+EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
 
 /**
  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
-- 
cgit v1.2.3


From 16d1e00c7e8a4950e914223b3112144289a82913 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 9 May 2022 15:42:52 -0700
Subject: bpf: Add MEM_UNINIT as a bpf_type_flag

Instead of having uninitialized versions of arguments as separate
bpf_arg_types (eg ARG_PTR_TO_UNINIT_MEM as the uninitialized version
of ARG_PTR_TO_MEM), we can instead use MEM_UNINIT as a bpf_type_flag
modifier to denote that the argument is uninitialized.

Doing so cleans up some of the logic in the verifier. We no longer
need to do two checks against an argument type (eg "if
(base_type(arg_type) == ARG_PTR_TO_MEM || base_type(arg_type) ==
ARG_PTR_TO_UNINIT_MEM)"), since uninitialized and initialized
versions of the same argument type will now share the same base type.

In the near future, MEM_UNINIT will be used by dynptr helper functions
as well.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20220509224257.3222614-2-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c  |  4 ++--
 kernel/bpf/verifier.c | 28 ++++++++--------------------
 2 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d5f104a39092..bad96131a510 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -103,7 +103,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
+	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
 };
 
 BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
@@ -116,7 +116,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
-	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
+	.arg2_type	= ARG_PTR_TO_MAP_VALUE | MEM_UNINIT,
 };
 
 BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 05c1b6656824..9b59581026f8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5377,12 +5377,6 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
-static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
-{
-	return base_type(type) == ARG_PTR_TO_MEM ||
-	       base_type(type) == ARG_PTR_TO_UNINIT_MEM;
-}
-
 static bool arg_type_is_mem_size(enum bpf_arg_type type)
 {
 	return type == ARG_CONST_SIZE ||
@@ -5522,7 +5516,6 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
 	[ARG_PTR_TO_MAP_VALUE]		= &map_key_value_types,
-	[ARG_PTR_TO_UNINIT_MAP_VALUE]	= &map_key_value_types,
 	[ARG_CONST_SIZE]		= &scalar_types,
 	[ARG_CONST_SIZE_OR_ZERO]	= &scalar_types,
 	[ARG_CONST_ALLOC_SIZE_OR_ZERO]	= &scalar_types,
@@ -5536,7 +5529,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_BTF_ID]		= &btf_ptr_types,
 	[ARG_PTR_TO_SPIN_LOCK]		= &spin_lock_types,
 	[ARG_PTR_TO_MEM]		= &mem_types,
-	[ARG_PTR_TO_UNINIT_MEM]		= &mem_types,
 	[ARG_PTR_TO_ALLOC_MEM]		= &alloc_mem_types,
 	[ARG_PTR_TO_INT]		= &int_ptr_types,
 	[ARG_PTR_TO_LONG]		= &int_ptr_types,
@@ -5710,8 +5702,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 		return -EACCES;
 	}
 
-	if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
-	    base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+	if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
 		err = resolve_map_arg_type(env, meta, &arg_type);
 		if (err)
 			return err;
@@ -5797,8 +5788,7 @@ skip_type_check:
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->key_size, false,
 					      NULL);
-	} else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
-		   base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+	} else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
 		if (type_may_be_null(arg_type) && register_is_null(reg))
 			return 0;
 
@@ -5810,7 +5800,7 @@ skip_type_check:
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
+		meta->raw_mode = arg_type & MEM_UNINIT;
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->value_size, false,
 					      meta);
@@ -5837,11 +5827,11 @@ skip_type_check:
 			return -EACCES;
 	} else if (arg_type == ARG_PTR_TO_FUNC) {
 		meta->subprogno = reg->subprogno;
-	} else if (arg_type_is_mem_ptr(arg_type)) {
+	} else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
 		/* The access to this pointer is only checked when we hit the
 		 * next is_mem_size argument below.
 		 */
-		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM);
+		meta->raw_mode = arg_type & MEM_UNINIT;
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
@@ -6194,10 +6184,8 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
 static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
 				    enum bpf_arg_type arg_next)
 {
-	return (arg_type_is_mem_ptr(arg_curr) &&
-	        !arg_type_is_mem_size(arg_next)) ||
-	       (!arg_type_is_mem_ptr(arg_curr) &&
-		arg_type_is_mem_size(arg_next));
+	return (base_type(arg_curr) == ARG_PTR_TO_MEM) !=
+		arg_type_is_mem_size(arg_next);
 }
 
 static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -6208,7 +6196,7 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	 * helper function specification.
 	 */
 	if (arg_type_is_mem_size(fn->arg1_type) ||
-	    arg_type_is_mem_ptr(fn->arg5_type)  ||
+	    base_type(fn->arg5_type) == ARG_PTR_TO_MEM ||
 	    check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
 	    check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
 	    check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
-- 
cgit v1.2.3


From 317f29c14d0cca09952f1022491454b23455ebcb Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 11 May 2022 13:19:51 -0700
Subject: timers: Provide a better debugobjects hint for delayed works

With debugobjects enabled the timer hint for freeing of active timers
embedded inside delayed works is always the same, i.e. the hint is
delayed_work_timer_fn, even though the function the delayed work is going
to run can be wildly different depending on what work was queued.  Enabling
workqueue debugobjects doesn't help either because the delayed work isn't
considered active until it is actually queued to run on a workqueue. If the
work is freed while the timer is pending the work isn't considered active
so there is no information from workqueue debugobjects.

Special case delayed works in the timer debugobjects hint logic so that the
delayed work function is returned instead of the delayed_work_timer_fn.
This will help to understand which delayed work was pending that got
freed.

Apply the same treatment for kthread_delayed_work because it follows the
same pattern.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20220511201951.42408-1-swboyd@chromium.org
---
 kernel/time/timer.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ef082d43c307..a0666d948147 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -638,9 +638,39 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer
 
 static const struct debug_obj_descr timer_debug_descr;
 
+struct timer_hint {
+	void	(*function)(struct timer_list *t);
+	long	offset;
+};
+
+#define TIMER_HINT(fn, container, timr, hintfn)			\
+	{							\
+		.function = fn,					\
+		.offset	  = offsetof(container, hintfn) -	\
+			    offsetof(container, timr)		\
+	}
+
+static const struct timer_hint timer_hints[] = {
+	TIMER_HINT(delayed_work_timer_fn,
+		   struct delayed_work, timer, work.func),
+	TIMER_HINT(kthread_delayed_work_timer_fn,
+		   struct kthread_delayed_work, timer, work.func),
+};
+
 static void *timer_debug_hint(void *addr)
 {
-	return ((struct timer_list *) addr)->function;
+	struct timer_list *timer = addr;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
+		if (timer_hints[i].function == timer->function) {
+			void (**fn)(void) = addr + timer_hints[i].offset;
+
+			return *fn;
+		}
+	}
+
+	return timer->function;
 }
 
 static bool timer_is_static_object(void *addr)
-- 
cgit v1.2.3


From 21673fcb2532dcd189905ff5a5389eb7dcd0e57a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 11 May 2022 13:07:50 +0200
Subject: genirq/irq_sim: Make the irq_work always run in hard irq context

The IRQ simulator uses irq_work to trigger an interrupt. Without the
IRQ_WORK_HARD_IRQ flag the irq_work will be performed in thread context
on PREEMPT_RT. This causes locking errors later in handle_simple_irq()
which expects to be invoked with disabled interrupts.

Triggering individual interrupts in hardirq context should not lead to
unexpected high latencies since this is also what the hardware
controller does. Also it is used as a simulator so...

Use IRQ_WORK_INIT_HARD() to carry out the irq_work in hardirq context on
PREEMPT_RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/YnuZBoEVMGwKkLm+@linutronix.de
---
 kernel/irq/irq_sim.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 0cd02efa3a74..dd76323ea3fd 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
 		goto err_free_bitmap;
 
 	work_ctx->irq_count = num_irqs;
-	init_irq_work(&work_ctx->work, irq_sim_handle_irq);
+	work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
 
 	return work_ctx->domain;
 
-- 
cgit v1.2.3


From 2434031c7cb4906be2d380981caa1f87d8899288 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 4 May 2022 09:09:41 +0200
Subject: kcsan: test: use new suite_{init,exit} support

Use the newly added suite_{init,exit} support for suite-wide init and
cleanup. This avoids the unsupported method by which the test used to do
suite-wide init and cleanup (avoiding issues such as missing TAP
headers, and possible future conflicts).

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 kernel/kcsan/kcsan_test.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index a36fca063a73..59560b5e1d9c 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1565,14 +1565,6 @@ static void test_exit(struct kunit *test)
 	torture_cleanup_end();
 }
 
-static struct kunit_suite kcsan_test_suite = {
-	.name = "kcsan",
-	.test_cases = kcsan_test_cases,
-	.init = test_init,
-	.exit = test_exit,
-};
-static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
-
 __no_kcsan
 static void register_tracepoints(struct tracepoint *tp, void *ignore)
 {
@@ -1588,11 +1580,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
 		tracepoint_probe_unregister(tp, probe_console, NULL);
 }
 
-/*
- * We only want to do tracepoints setup and teardown once, therefore we have to
- * customize the init and exit functions and cannot rely on kunit_test_suite().
- */
-static int __init kcsan_test_init(void)
+static int kcsan_suite_init(struct kunit_suite *suite)
 {
 	/*
 	 * Because we want to be able to build the test as a module, we need to
@@ -1600,18 +1588,25 @@ static int __init kcsan_test_init(void)
 	 * won't work here.
 	 */
 	for_each_kernel_tracepoint(register_tracepoints, NULL);
-	return __kunit_test_suites_init(kcsan_test_suites);
+	return 0;
 }
 
-static void kcsan_test_exit(void)
+static void kcsan_suite_exit(struct kunit_suite *suite)
 {
-	__kunit_test_suites_exit(kcsan_test_suites);
 	for_each_kernel_tracepoint(unregister_tracepoints, NULL);
 	tracepoint_synchronize_unregister();
 }
 
-late_initcall_sync(kcsan_test_init);
-module_exit(kcsan_test_exit);
+static struct kunit_suite kcsan_test_suite = {
+	.name = "kcsan",
+	.test_cases = kcsan_test_cases,
+	.init = test_init,
+	.exit = test_exit,
+	.suite_init = kcsan_suite_init,
+	.suite_exit = kcsan_suite_exit,
+};
+
+kunit_test_suites(&kcsan_test_suite);
 
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Marco Elver <elver@google.com>");
-- 
cgit v1.2.3


From 82806744fd7dde603b64c151eeddaa4ee62193fd Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Tue, 10 May 2022 10:21:09 -0400
Subject: swiotlb: max mapping size takes min align mask into account

swiotlb_find_slots() skips slots according to io tlb aligned mask
calculated from min aligned mask and original physical address
offset. This affects max mapping size. The mapping size can't
achieve the IO_TLB_SEGSIZE * IO_TLB_SIZE when original offset is
non-zero. This will cause system boot up failure in Hyper-V
Isolation VM where swiotlb force is enabled. Scsi layer use return
value of dma_max_mapping_size() to set max segment size and it
finally calls swiotlb_max_mapping_size(). Hyper-V storage driver
sets min align mask to 4k - 1. Scsi layer may pass 256k length of
request buffer with 0~4k offset and Hyper-V storage driver can't
get swiotlb bounce buffer via DMA API. Swiotlb_find_slots() can't
find 256k length bounce buffer with offset. Make swiotlb_max_mapping
_size() take min align mask into account.

Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d6e62a6a42ce..dfa1de89dc94 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -736,7 +736,18 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 
 size_t swiotlb_max_mapping_size(struct device *dev)
 {
-	return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
+	int min_align_mask = dma_get_min_align_mask(dev);
+	int min_align = 0;
+
+	/*
+	 * swiotlb_find_slots() skips slots according to
+	 * min align mask. This affects max mapping size.
+	 * Take it into acount here.
+	 */
+	if (min_align_mask)
+		min_align = roundup(min_align_mask, IO_TLB_SIZE);
+
+	return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
 }
 
 bool is_swiotlb_active(struct device *dev)
-- 
cgit v1.2.3


From 69e9cd66ae1392437234a63a3a1d60b6655f92ef Mon Sep 17 00:00:00 2001
From: Julian Orth <ju.orth@gmail.com>
Date: Tue, 17 May 2022 12:32:53 +0200
Subject: audit,io_uring,io-wq: call __audit_uring_exit for dummy contexts

Not calling the function for dummy contexts will cause the context to
not be reset. During the next syscall, this will cause an error in
__audit_syscall_entry:

	WARN_ON(context->context != AUDIT_CTX_UNUSED);
	WARN_ON(context->name_count);
	if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
		audit_panic("unrecoverable error in audit_syscall_entry()");
		return;
	}

These problematic dummy contexts are created via the following call
chain:

       exit_to_user_mode_prepare
    -> arch_do_signal_or_restart
    -> get_signal
    -> task_work_run
    -> tctx_task_work
    -> io_req_task_submit
    -> io_issue_sqe
    -> audit_uring_entry

Cc: stable@vger.kernel.org
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Signed-off-by: Julian Orth <ju.orth@gmail.com>
[PM: subject line tweaks]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ea2ee1181921..f3a2abd6d1a1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1959,6 +1959,12 @@ void __audit_uring_exit(int success, long code)
 {
 	struct audit_context *ctx = audit_context();
 
+	if (ctx->dummy) {
+		if (ctx->context != AUDIT_CTX_URING)
+			return;
+		goto out;
+	}
+
 	if (ctx->context == AUDIT_CTX_SYSCALL) {
 		/*
 		 * NOTE: See the note in __audit_uring_entry() about the case
-- 
cgit v1.2.3


From d4150779e60fb6c49be25572596b2cdfc5d46a09 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 11 May 2022 16:11:29 +0200
Subject: random32: use real rng for non-deterministic randomness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

random32.c has two random number generators in it: one that is meant to
be used deterministically, with some predefined seed, and one that does
the same exact thing as random.c, except does it poorly. The first one
has some use cases. The second one no longer does and can be replaced
with calls to random.c's proper random number generator.

The relatively recent siphash-based bad random32.c code was added in
response to concerns that the prior random32.c was too deterministic.
Out of fears that random.c was (at the time) too slow, this code was
anonymously contributed. Then out of that emerged a kind of shadow
entropy gathering system, with its own tentacles throughout various net
code, added willy nilly.

Stop👏making👏bespoke👏random👏number👏generators👏.

Fortunately, recent advances in random.c mean that we can stop playing
with this sketchiness, and just use get_random_u32(), which is now fast
enough. In micro benchmarks using RDPMC, I'm seeing the same median
cycle count between the two functions, with the mean being _slightly_
higher due to batches refilling (which we can optimize further need be).
However, when doing *real* benchmarks of the net functions that actually
use these random numbers, the mean cycles actually *decreased* slightly
(with the median still staying the same), likely because the additional
prandom code means icache misses and complexity, whereas random.c is
generally already being used by something else nearby.

The biggest benefit of this is that there are many users of prandom who
probably should be using cryptographically secure random numbers. This
makes all of those accidental cases become secure by just flipping a
switch. Later on, we can do a tree-wide cleanup to remove the static
inline wrapper functions that this commit adds.

There are also some low-ish hanging fruits for making this even faster
in the future: a get_random_u16() function for use in the networking
stack will give a 2x performance boost there, using SIMD for ChaCha20
will let us compute 4 or 8 or 16 blocks of output in parallel, instead
of just one, giving us large buffers for cheap, and introducing a
get_random_*_bh() function that assumes irqs are already disabled will
shave off a few cycles for ordinary calls. These are things we can chip
away at down the road.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 kernel/time/timer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 9dd2a39cb3b0..c12fe329c9ff 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1780,8 +1780,6 @@ void update_process_times(int user_tick)
 {
 	struct task_struct *p = current;
 
-	PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
-
 	/* Note: this timer irq context must be accounted for as well. */
 	account_process_tick(p, user_tick);
 	run_local_timers();
-- 
cgit v1.2.3


From 29ed17389c4dcd09c5be8d88ddf6f4f60241567d Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Tue, 17 May 2022 19:25:23 +0800
Subject: cgroup: Make cgroup_debug static

Make cgroup_debug static since it's only used in cgroup.c

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup-internal.h | 1 -
 kernel/cgroup/cgroup.c          | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 6e36e854b512..5da09c74228d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
 #define TRACE_CGROUP_PATH_LEN 1024
 extern spinlock_t trace_cgroup_path_lock;
 extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
 extern void __init enable_debug_cgroup(void);
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index adb820e98f24..a97fd051430b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
 
 DEFINE_SPINLOCK(trace_cgroup_path_lock);
 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
 
 /*
  * Protects cgroup_idr and css_idr so that IDs can be released without
-- 
cgit v1.2.3


From 13dfd97a341a5cf9d15f415dd469f45e971ef12a Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Thu, 19 May 2022 14:02:32 +0300
Subject: notifier: Add atomic_notifier_call_chain_is_empty()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add atomic_notifier_call_chain_is_empty() that returns true if given
atomic call chain is empty.

The first user of this new notifier API function will be the kernel
power-off core code that will support power-off call chains. The core
code will need to check whether there is a power-off handler registered
at all in order to decide whether to halt machine or power it off.

Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/notifier.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/notifier.c b/kernel/notifier.c
index ba005ebf4730..137b902c985b 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -204,6 +204,19 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
 EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
 NOKPROBE_SYMBOL(atomic_notifier_call_chain);
 
+/**
+ *	atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
+ *	@nh: Pointer to head of the atomic notifier chain
+ *
+ *	Checks whether notifier chain is empty.
+ *
+ *	Returns true is notifier chain is empty, false otherwise.
+ */
+bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
+{
+	return !rcu_access_pointer(nh->head);
+}
+
 /*
  *	Blocking notifier chain routines.  All access to the chain is
  *	synchronized by an rwsem.
-- 
cgit v1.2.3


From c82f898d873ce10d88f8d60b38c8dd19f1ed7c66 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:10 +0300
Subject: notifier: Add blocking/atomic_notifier_chain_register_unique_prio()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add variant of blocking/atomic_notifier_chain_register() functions that
allow registration of a notifier only if it has unique priority, otherwise
-EBUSY error code is returned by the new functions.

Reviewed-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/notifier.c | 88 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 69 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/notifier.c b/kernel/notifier.c
index 137b902c985b..0d5bd62c480e 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -20,7 +20,8 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
  */
 
 static int notifier_chain_register(struct notifier_block **nl,
-				   struct notifier_block *n)
+				   struct notifier_block *n,
+				   bool unique_priority)
 {
 	while ((*nl) != NULL) {
 		if (unlikely((*nl) == n)) {
@@ -30,6 +31,8 @@ static int notifier_chain_register(struct notifier_block **nl,
 		}
 		if (n->priority > (*nl)->priority)
 			break;
+		if (n->priority == (*nl)->priority && unique_priority)
+			return -EBUSY;
 		nl = &((*nl)->next);
 	}
 	n->next = *nl;
@@ -144,12 +147,35 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
 	int ret;
 
 	spin_lock_irqsave(&nh->lock, flags);
-	ret = notifier_chain_register(&nh->head, n);
+	ret = notifier_chain_register(&nh->head, n, false);
 	spin_unlock_irqrestore(&nh->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
 
+/**
+ *	atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to an atomic notifier chain if there is no other
+ *	notifier registered using the same priority.
+ *
+ *	Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
+					       struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_register(&nh->head, n, true);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);
+
 /**
  *	atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
  *	@nh: Pointer to head of the atomic notifier chain
@@ -222,18 +248,9 @@ bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
  *	synchronized by an rwsem.
  */
 
-/**
- *	blocking_notifier_chain_register - Add notifier to a blocking notifier chain
- *	@nh: Pointer to head of the blocking notifier chain
- *	@n: New entry in notifier chain
- *
- *	Adds a notifier to a blocking notifier chain.
- *	Must be called in process context.
- *
- *	Returns 0 on success, %-EEXIST on error.
- */
-int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
-		struct notifier_block *n)
+static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+					      struct notifier_block *n,
+					      bool unique_priority)
 {
 	int ret;
 
@@ -243,15 +260,48 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
 	 * such times we must not call down_write().
 	 */
 	if (unlikely(system_state == SYSTEM_BOOTING))
-		return notifier_chain_register(&nh->head, n);
+		return notifier_chain_register(&nh->head, n, unique_priority);
 
 	down_write(&nh->rwsem);
-	ret = notifier_chain_register(&nh->head, n);
+	ret = notifier_chain_register(&nh->head, n, unique_priority);
 	up_write(&nh->rwsem);
 	return ret;
 }
+
+/**
+ *	blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a blocking notifier chain.
+ *	Must be called in process context.
+ *
+ *	Returns 0 on success, %-EEXIST on error.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return __blocking_notifier_chain_register(nh, n, false);
+}
 EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
 
+/**
+ *	blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to an blocking notifier chain if there is no other
+ *	notifier registered using the same priority.
+ *
+ *	Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
+						 struct notifier_block *n)
+{
+	return __blocking_notifier_chain_register(nh, n, true);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);
+
 /**
  *	blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
  *	@nh: Pointer to head of the blocking notifier chain
@@ -354,7 +404,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
 int raw_notifier_chain_register(struct raw_notifier_head *nh,
 		struct notifier_block *n)
 {
-	return notifier_chain_register(&nh->head, n);
+	return notifier_chain_register(&nh->head, n, false);
 }
 EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
 
@@ -433,10 +483,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
 	 * such times we must not call mutex_lock().
 	 */
 	if (unlikely(system_state == SYSTEM_BOOTING))
-		return notifier_chain_register(&nh->head, n);
+		return notifier_chain_register(&nh->head, n, false);
 
 	mutex_lock(&nh->mutex);
-	ret = notifier_chain_register(&nh->head, n);
+	ret = notifier_chain_register(&nh->head, n, false);
 	mutex_unlock(&nh->mutex);
 	return ret;
 }
-- 
cgit v1.2.3


From 232edc2f72f5beda3586360de6254d443820050a Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:11 +0300
Subject: kernel/reboot: Introduce sys-off handler API

In order to support power-off chaining we need to get rid of the global
pm_* variables, replacing them with the new kernel API functions that
support chaining.

Introduce new generic sys-off handler API that brings the following
features:

1. Power-off and restart handlers are registered using same API function
   that supports chaining, hence all power-off and restart modes will
   support chaining using this unified function.

2. Prevents notifier priority collisions by disallowing registration of
   multiple handlers at the non-default priority level.

3. Supports passing opaque user argument to callback, which allows us to
   remove global variables from drivers.

This patch adds support of the following sys-off modes:

- SYS_OFF_MODE_POWER_OFF_PREPARE that replaces global pm_power_off_prepare
  variable and provides chaining support for power-off-prepare handlers.

- SYS_OFF_MODE_POWER_OFF that replaces global pm_power_off variable and
  provides chaining support for power-off handlers.

- SYS_OFF_MODE_RESTART that provides a better restart API, removing a need
  from drivers to have a global scratch variable by utilizing the opaque
  callback argument.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 182 insertions(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6bcc5d6a6572..3e6867ee61e0 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -48,6 +48,15 @@ int reboot_cpu;
 enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
+struct sys_off_handler {
+	struct notifier_block nb;
+	int (*sys_off_cb)(struct sys_off_data *data);
+	void *cb_data;
+	enum sys_off_mode mode;
+	bool blocking;
+	void *list;
+};
+
 /*
  * If set, this is used for preparing the system to power off.
  */
@@ -281,6 +290,179 @@ void kernel_halt(void)
 }
 EXPORT_SYMBOL_GPL(kernel_halt);
 
+/*
+ *	Notifier list for kernel code which wants to be called
+ *	to prepare system for power off.
+ */
+static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list);
+
+/*
+ *	Notifier list for kernel code which wants to be called
+ *	to power off system.
+ */
+static ATOMIC_NOTIFIER_HEAD(power_off_handler_list);
+
+static int sys_off_notify(struct notifier_block *nb,
+			  unsigned long mode, void *cmd)
+{
+	struct sys_off_handler *handler;
+	struct sys_off_data data = {};
+
+	handler = container_of(nb, struct sys_off_handler, nb);
+	data.cb_data = handler->cb_data;
+	data.mode = mode;
+	data.cmd = cmd;
+
+	return handler->sys_off_cb(&data);
+}
+
+/**
+ *	register_sys_off_handler - Register sys-off handler
+ *	@mode: Sys-off mode
+ *	@priority: Handler priority
+ *	@callback: Callback function
+ *	@cb_data: Callback argument
+ *
+ *	Registers system power-off or restart handler that will be invoked
+ *	at the step corresponding to the given sys-off mode. Handler's callback
+ *	should return NOTIFY_DONE to permit execution of the next handler in
+ *	the call chain or NOTIFY_STOP to break the chain (in error case for
+ *	example).
+ *
+ *	Multiple handlers can be registered at the default priority level.
+ *
+ *	Only one handler can be registered at the non-default priority level,
+ *	otherwise ERR_PTR(-EBUSY) is returned.
+ *
+ *	Returns a new instance of struct sys_off_handler on success, or
+ *	an ERR_PTR()-encoded error code otherwise.
+ */
+struct sys_off_handler *
+register_sys_off_handler(enum sys_off_mode mode,
+			 int priority,
+			 int (*callback)(struct sys_off_data *data),
+			 void *cb_data)
+{
+	struct sys_off_handler *handler;
+	int err;
+
+	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+	if (!handler)
+		return ERR_PTR(-ENOMEM);
+
+	switch (mode) {
+	case SYS_OFF_MODE_POWER_OFF_PREPARE:
+		handler->list = &power_off_prep_handler_list;
+		handler->blocking = true;
+		break;
+
+	case SYS_OFF_MODE_POWER_OFF:
+		handler->list = &power_off_handler_list;
+		break;
+
+	case SYS_OFF_MODE_RESTART:
+		handler->list = &restart_handler_list;
+		break;
+
+	default:
+		kfree(handler);
+		return ERR_PTR(-EINVAL);
+	}
+
+	handler->nb.notifier_call = sys_off_notify;
+	handler->nb.priority = priority;
+	handler->sys_off_cb = callback;
+	handler->cb_data = cb_data;
+	handler->mode = mode;
+
+	if (handler->blocking) {
+		if (priority == SYS_OFF_PRIO_DEFAULT)
+			err = blocking_notifier_chain_register(handler->list,
+							       &handler->nb);
+		else
+			err = blocking_notifier_chain_register_unique_prio(handler->list,
+									   &handler->nb);
+	} else {
+		if (priority == SYS_OFF_PRIO_DEFAULT)
+			err = atomic_notifier_chain_register(handler->list,
+							     &handler->nb);
+		else
+			err = atomic_notifier_chain_register_unique_prio(handler->list,
+									 &handler->nb);
+	}
+
+	if (err) {
+		kfree(handler);
+		return ERR_PTR(err);
+	}
+
+	return handler;
+}
+EXPORT_SYMBOL_GPL(register_sys_off_handler);
+
+/**
+ *	unregister_sys_off_handler - Unregister sys-off handler
+ *	@handler: Sys-off handler
+ *
+ *	Unregisters given sys-off handler.
+ */
+void unregister_sys_off_handler(struct sys_off_handler *handler)
+{
+	int err;
+
+	if (!handler)
+		return;
+
+	if (handler->blocking)
+		err = blocking_notifier_chain_unregister(handler->list,
+							 &handler->nb);
+	else
+		err = atomic_notifier_chain_unregister(handler->list,
+						       &handler->nb);
+
+	/* sanity check, shall never happen */
+	WARN_ON(err);
+
+	kfree(handler);
+}
+EXPORT_SYMBOL_GPL(unregister_sys_off_handler);
+
+static void devm_unregister_sys_off_handler(void *data)
+{
+	struct sys_off_handler *handler = data;
+
+	unregister_sys_off_handler(handler);
+}
+
+/**
+ *	devm_register_sys_off_handler - Register sys-off handler
+ *	@dev: Device that registers handler
+ *	@mode: Sys-off mode
+ *	@priority: Handler priority
+ *	@callback: Callback function
+ *	@cb_data: Callback argument
+ *
+ *	Registers resource-managed sys-off handler.
+ *
+ *	Returns zero on success, or error code on failure.
+ */
+int devm_register_sys_off_handler(struct device *dev,
+				  enum sys_off_mode mode,
+				  int priority,
+				  int (*callback)(struct sys_off_data *data),
+				  void *cb_data)
+{
+	struct sys_off_handler *handler;
+
+	handler = register_sys_off_handler(mode, priority, callback, cb_data);
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
+
+	return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
+					handler);
+}
+EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
+
 /**
  *	kernel_power_off - power_off the system
  *
-- 
cgit v1.2.3


From 7b9a3de9ffe76e4a965bafe26c91d9c6351e1e18 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:12 +0300
Subject: kernel/reboot: Wrap legacy power-off callbacks into sys-off handlers

Wrap legacy power-off callbacks into sys-off handlers in order to
support co-existence of both legacy and new callbacks while we're
in process of upgrading legacy callbacks to the new API.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3e6867ee61e0..96f681f7b20c 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -463,6 +463,47 @@ int devm_register_sys_off_handler(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
 
+static int legacy_pm_power_off_prepare(struct sys_off_data *data)
+{
+	if (pm_power_off_prepare)
+		pm_power_off_prepare();
+
+	return NOTIFY_DONE;
+}
+
+static int legacy_pm_power_off(struct sys_off_data *data)
+{
+	if (pm_power_off)
+		pm_power_off();
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Register sys-off handlers for legacy PM callbacks. This allows legacy
+ * PM callbacks co-exist with the new sys-off API.
+ *
+ * TODO: Remove legacy handlers once all legacy PM users will be switched
+ *       to the sys-off based APIs.
+ */
+static int __init legacy_pm_init(void)
+{
+	register_sys_off_handler(SYS_OFF_MODE_POWER_OFF_PREPARE,
+				 SYS_OFF_PRIO_DEFAULT,
+				 legacy_pm_power_off_prepare, NULL);
+
+	register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_DEFAULT,
+				 legacy_pm_power_off, NULL);
+
+	return 0;
+}
+core_initcall(legacy_pm_init);
+
+static void do_kernel_power_off_prepare(void)
+{
+	blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
+}
+
 /**
  *	kernel_power_off - power_off the system
  *
@@ -471,8 +512,7 @@ EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
 void kernel_power_off(void)
 {
 	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
-	if (pm_power_off_prepare)
-		pm_power_off_prepare();
+	do_kernel_power_off_prepare();
 	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	pr_emerg("Power down\n");
-- 
cgit v1.2.3


From 2b6aa7332f8020dfdaffe340ff038aac4df35238 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:13 +0300
Subject: kernel/reboot: Add do_kernel_power_off()

Add do_kernel_power_off() helper that will remove open-coded pm_power_off
invocations from the architecture code. This is the first step on the way
to remove the global pm_power_off variable, which will allow us to
implement consistent power-off chaining support.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 96f681f7b20c..892d25aa12df 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -504,6 +504,19 @@ static void do_kernel_power_off_prepare(void)
 	blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
 }
 
+/**
+ *	do_kernel_power_off - Execute kernel power-off handler call chain
+ *
+ *	Expected to be called as last step of the power-off sequence.
+ *
+ *	Powers off the system immediately if a power-off handler function has
+ *	been registered. Otherwise does nothing.
+ */
+void do_kernel_power_off(void)
+{
+	atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
+}
+
 /**
  *	kernel_power_off - power_off the system
  *
-- 
cgit v1.2.3


From 5d34b41aa420c3908793b43419a1097362ca2668 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:14 +0300
Subject: kernel/reboot: Add stub for pm_power_off

Add weak stub for the global pm_power_off callback variable. This will
allow us to remove pm_power_off definitions from arch/ code and transition
to the new sys-off based API that will replace the global variable.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 892d25aa12df..7723aff4d09b 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -57,6 +57,12 @@ struct sys_off_handler {
 	void *list;
 };
 
+/*
+ * Temporary stub that prevents linkage failure while we're in process
+ * of removing all uses of legacy pm_power_off() around the kernel.
+ */
+void __weak (*pm_power_off)(void);
+
 /*
  * If set, this is used for preparing the system to power off.
  */
-- 
cgit v1.2.3


From 0e2110d2e910e44cc7cf23fce14613e232602602 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:15 +0300
Subject: kernel/reboot: Add kernel_can_power_off()

Add kernel_can_power_off() helper that replaces open-coded checks of
the global pm_power_off variable. This is a necessary step towards
supporting chained power-off handlers.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 7723aff4d09b..fb3f49350927 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -523,6 +523,18 @@ void do_kernel_power_off(void)
 	atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
 }
 
+/**
+ *	kernel_can_power_off - check whether system can be powered off
+ *
+ *	Returns true if power-off handler is registered and system can be
+ *	powered off, false otherwise.
+ */
+bool kernel_can_power_off(void)
+{
+	return !atomic_notifier_call_chain_is_empty(&power_off_handler_list);
+}
+EXPORT_SYMBOL_GPL(kernel_can_power_off);
+
 /**
  *	kernel_power_off - power_off the system
  *
@@ -581,7 +593,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	/* Instead of trying to make the power_off code look like
 	 * halt when pm_power_off is not set do it the easy way.
 	 */
-	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
 		cmd = LINUX_REBOOT_CMD_HALT;
 
 	mutex_lock(&system_transition_mutex);
-- 
cgit v1.2.3


From fb61375ecfba49e153af561402f49f6fe3bebd39 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:16 +0300
Subject: kernel/reboot: Add register_platform_power_off()

Add platform-level registration helpers that will ease transition of the
arch/platform power-off callbacks to the new sys-off based API, allowing
us to remove the global pm_power_off variable in the future.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index fb3f49350927..673d3f54dcad 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -469,6 +469,61 @@ int devm_register_sys_off_handler(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
 
+static struct sys_off_handler *platform_power_off_handler;
+
+static int platform_power_off_notify(struct sys_off_data *data)
+{
+	void (*platform_power_power_off_cb)(void) = data->cb_data;
+
+	platform_power_power_off_cb();
+
+	return NOTIFY_DONE;
+}
+
+/**
+ *	register_platform_power_off - Register platform-level power-off callback
+ *	@power_off: Power-off callback
+ *
+ *	Registers power-off callback that will be called as last step
+ *	of the power-off sequence. This callback is expected to be invoked
+ *	for the last resort. Only one platform power-off callback is allowed
+ *	to be registered at a time.
+ *
+ *	Returns zero on success, or error code on failure.
+ */
+int register_platform_power_off(void (*power_off)(void))
+{
+	struct sys_off_handler *handler;
+
+	handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+					   SYS_OFF_PRIO_PLATFORM,
+					   platform_power_off_notify,
+					   power_off);
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
+
+	platform_power_off_handler = handler;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_platform_power_off);
+
+/**
+ *	unregister_platform_power_off - Unregister platform-level power-off callback
+ *	@power_off: Power-off callback
+ *
+ *	Unregisters previously registered platform power-off callback.
+ */
+void unregister_platform_power_off(void (*power_off)(void))
+{
+	if (platform_power_off_handler &&
+	    platform_power_off_handler->cb_data == power_off) {
+		unregister_sys_off_handler(platform_power_off_handler);
+		platform_power_off_handler = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(unregister_platform_power_off);
+
 static int legacy_pm_power_off_prepare(struct sys_off_data *data)
 {
 	if (pm_power_off_prepare)
-- 
cgit v1.2.3


From 5b71808eb7c97815fc3b9c386ca0ef6daf2dc053 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:32 +0300
Subject: reboot: Remove pm_power_off_prepare()

All pm_power_off_prepare() users were converted to sys-off handler API.
Remove the obsolete global callback variable.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 673d3f54dcad..88c835167d9e 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -63,13 +63,6 @@ struct sys_off_handler {
  */
 void __weak (*pm_power_off)(void);
 
-/*
- * If set, this is used for preparing the system to power off.
- */
-
-void (*pm_power_off_prepare)(void);
-EXPORT_SYMBOL_GPL(pm_power_off_prepare);
-
 /**
  *	emergency_restart - reboot the system
  *
@@ -524,14 +517,6 @@ void unregister_platform_power_off(void (*power_off)(void))
 }
 EXPORT_SYMBOL_GPL(unregister_platform_power_off);
 
-static int legacy_pm_power_off_prepare(struct sys_off_data *data)
-{
-	if (pm_power_off_prepare)
-		pm_power_off_prepare();
-
-	return NOTIFY_DONE;
-}
-
 static int legacy_pm_power_off(struct sys_off_data *data)
 {
 	if (pm_power_off)
@@ -549,10 +534,6 @@ static int legacy_pm_power_off(struct sys_off_data *data)
  */
 static int __init legacy_pm_init(void)
 {
-	register_sys_off_handler(SYS_OFF_MODE_POWER_OFF_PREPARE,
-				 SYS_OFF_PRIO_DEFAULT,
-				 legacy_pm_power_off_prepare, NULL);
-
 	register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_DEFAULT,
 				 legacy_pm_power_off, NULL);
 
-- 
cgit v1.2.3


From d2c5415327171e6c1bca2dc4d8a77f450ecf7150 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:34 +0300
Subject: kernel/reboot: Add devm_register_power_off_handler()

Add devm_register_power_off_handler() helper that registers sys-off
handler using power-off mode and with a default priority. Most drivers
will want to register power-off handler with a default priority, so this
helper will reduce the boilerplate code and make code easier to read and
follow.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 88c835167d9e..4dbb6e29247a 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -462,6 +462,28 @@ int devm_register_sys_off_handler(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
 
+/**
+ *	devm_register_power_off_handler - Register power-off handler
+ *	@dev: Device that registers callback
+ *	@callback: Callback function
+ *	@cb_data: Callback's argument
+ *
+ *	Registers resource-managed sys-off handler with a default priority
+ *	and using power-off mode.
+ *
+ *	Returns zero on success, or error code on failure.
+ */
+int devm_register_power_off_handler(struct device *dev,
+				    int (*callback)(struct sys_off_data *data),
+				    void *cb_data)
+{
+	return devm_register_sys_off_handler(dev,
+					     SYS_OFF_MODE_POWER_OFF,
+					     SYS_OFF_PRIO_DEFAULT,
+					     callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_power_off_handler);
+
 static struct sys_off_handler *platform_power_off_handler;
 
 static int platform_power_off_notify(struct sys_off_data *data)
-- 
cgit v1.2.3


From 6779db970bd287bb35b28bd5dc256fd7aef19d1c Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Tue, 10 May 2022 02:32:35 +0300
Subject: kernel/reboot: Add devm_register_restart_handler()

Add devm_register_restart_handler() helper that registers sys-off
handler using restart mode and with a default priority. Most drivers
will want to register restart handler with a default priority, so this
helper will reduce the boilerplate code and make code easier to read and
follow.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 4dbb6e29247a..bdb635842ec8 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -484,6 +484,28 @@ int devm_register_power_off_handler(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_register_power_off_handler);
 
+/**
+ *	devm_register_restart_handler - Register restart handler
+ *	@dev: Device that registers callback
+ *	@callback: Callback function
+ *	@cb_data: Callback's argument
+ *
+ *	Registers resource-managed sys-off handler with a default priority
+ *	and using restart mode.
+ *
+ *	Returns zero on success, or error code on failure.
+ */
+int devm_register_restart_handler(struct device *dev,
+				  int (*callback)(struct sys_off_data *data),
+				  void *cb_data)
+{
+	return devm_register_sys_off_handler(dev,
+					     SYS_OFF_MODE_RESTART,
+					     SYS_OFF_PRIO_DEFAULT,
+					     callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_restart_handler);
+
 static struct sys_off_handler *platform_power_off_handler;
 
 static int platform_power_off_notify(struct sys_off_data *data)
-- 
cgit v1.2.3


From 4853f68d158ac59b05985a6af5b7da7ccdbc14c8 Mon Sep 17 00:00:00 2001
From: Liao Chang <liaochang1@huawei.com>
Date: Fri, 8 Apr 2022 18:09:09 +0800
Subject: kexec_file: Fix kexec_file.c build error for riscv platform

When CONFIG_KEXEC_FILE is set for riscv platform, the compilation of
kernel/kexec_file.c generate build error:

kernel/kexec_file.c: In function 'crash_prepare_elf64_headers':
./arch/riscv/include/asm/page.h:110:71: error: request for member 'virt_addr' in something not a structure or union
  110 |  ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < kernel_map.virt_addr))
      |                                                                       ^
./arch/riscv/include/asm/page.h:131:2: note: in expansion of macro 'is_linear_mapping'
  131 |  is_linear_mapping(_x) ?       \
      |  ^~~~~~~~~~~~~~~~~
./arch/riscv/include/asm/page.h:140:31: note: in expansion of macro '__va_to_pa_nodebug'
  140 | #define __phys_addr_symbol(x) __va_to_pa_nodebug(x)
      |                               ^~~~~~~~~~~~~~~~~~
./arch/riscv/include/asm/page.h:143:24: note: in expansion of macro '__phys_addr_symbol'
  143 | #define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
      |                        ^~~~~~~~~~~~~~~~~~
kernel/kexec_file.c:1327:36: note: in expansion of macro '__pa_symbol'
 1327 |   phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);

This occurs is because the "kernel_map" referenced in macro
is_linear_mapping()  is suppose to be the one of struct kernel_mapping
defined in arch/riscv/mm/init.c, but the 2nd argument of
crash_prepare_elf64_header() has same symbol name, in expansion of macro
is_linear_mapping in function crash_prepare_elf64_header(), "kernel_map"
actually is the local variable.

Signed-off-by: Liao Chang <liaochang1@huawei.com>
Link: https://lore.kernel.org/r/20220408100914.150110-2-lizhengyu3@huawei.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 kernel/kexec_file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8347fc158d2b..331a4f0f10f5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1260,7 +1260,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 	return 0;
 }
 
-int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 			  void **addr, unsigned long *sz)
 {
 	Elf64_Ehdr *ehdr;
@@ -1324,7 +1324,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
 	phdr++;
 
 	/* Prepare PT_LOAD type program header for kernel text region */
-	if (kernel_map) {
+	if (need_kernel_map) {
 		phdr->p_type = PT_LOAD;
 		phdr->p_flags = PF_R|PF_W|PF_X;
 		phdr->p_vaddr = (unsigned long) _text;
-- 
cgit v1.2.3


From d2081b2bf8195b8239c67fdd61518e077da7cbec Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 19 May 2022 14:08:49 -0700
Subject: mm: khugepaged: make khugepaged_enter() void function

The most callers of khugepaged_enter() don't care about the return value.
Only dup_mmap(), anonymous THP page fault and MADV_HUGEPAGE handle the
error by returning -ENOMEM.  Actually it is not harmful for them to ignore
the error case either.  It also sounds overkilling to fail fork() and page
fault early due to khugepaged_enter() error, and MADV_HUGEPAGE does set
VM_HUGEPAGE flag regardless of the error.

Link: https://lkml.kernel.org/r/20220510203222.24246-6-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Vlastmil Babka <vbabka@suse.cz>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897560ab..0d13baf86650 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -612,9 +612,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	retval = ksm_fork(mm, oldmm);
 	if (retval)
 		goto out;
-	retval = khugepaged_fork(mm, oldmm);
-	if (retval)
-		goto out;
+	khugepaged_fork(mm, oldmm);
 
 	prev = NULL;
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
-- 
cgit v1.2.3


From 8491d1bdf5de152f27fc941e2dcdc4e66c950542 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Wed, 18 May 2022 20:49:53 +0200
Subject: sched/clock: Use try_cmpxchg64 in sched_clock_{local,remote}

Use try_cmpxchg64 instead of cmpxchg64 (*ptr, old, new) != old in
sched_clock_{local,remote}. x86 cmpxchg returns success in ZF flag,
so this change saves a compare after cmpxchg (and related move
instruction in front of cmpxchg).

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220518184953.3446778-1-ubizjak@gmail.com
---
 kernel/sched/clock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index d9272d9061a3..e374c0c923da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -287,7 +287,7 @@ again:
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
 
-	if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+	if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
 		goto again;
 
 	return clock;
@@ -349,7 +349,7 @@ again:
 		val = remote_clock;
 	}
 
-	if (cmpxchg64(ptr, old_val, val) != old_val)
+	if (!try_cmpxchg64(ptr, &old_val, val))
 		goto again;
 
 	return val;
-- 
cgit v1.2.3


From 546a3fee174969ff323d70ff27b1ef181f0d7ceb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 May 2022 13:46:54 +0200
Subject: sched: Reverse sched_class layout

Because GCC-12 is fully stupid about array bounds and it's just really
hard to get a solid array definition from a linker script, flip the
array order to avoid needing negative offsets :-/

This makes the whole relational pointer magic a little less obvious, but
alas.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/YoOLLmLG7HRTXeEm@hirez.programming.kicks-ass.net
---
 kernel/sched/core.c  | 12 ++++++------
 kernel/sched/sched.h | 15 ++++++++-------
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2c9f5e97682f..66c4e5922fe1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2193,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->sched_class == rq->curr->sched_class)
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
-	else if (p->sched_class > rq->curr->sched_class)
+	else if (sched_class_above(p->sched_class, rq->curr->sched_class))
 		resched_curr(rq);
 
 	/*
@@ -5692,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 * higher scheduling class, because otherwise those lose the
 	 * opportunity to pull in more work from other CPUs.
 	 */
-	if (likely(prev->sched_class <= &fair_sched_class &&
+	if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
 
 		p = pick_next_task_fair(rq, prev, rf);
@@ -9472,11 +9472,11 @@ void __init sched_init(void)
 	int i;
 
 	/* Make sure the linker didn't screw up */
-	BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
-	       &fair_sched_class + 1 != &rt_sched_class ||
-	       &rt_sched_class + 1   != &dl_sched_class);
+	BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+	       &fair_sched_class != &rt_sched_class + 1 ||
+	       &rt_sched_class   != &dl_sched_class + 1);
 #ifdef CONFIG_SMP
-	BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+	BUG_ON(&dl_sched_class != &stop_sched_class + 1);
 #endif
 
 	wait_bit_init();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fe4d1acb7e38..2ce18584dca3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2177,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
  *
  *   include/asm-generic/vmlinux.lds.h
  *
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
  * Also enforce alignment on the instance, not the type, to guarantee layout.
  */
 #define DEFINE_SCHED_CLASS(name) \
@@ -2185,17 +2187,16 @@ const struct sched_class name##_sched_class \
 	__section("__" #name "_sched_class")
 
 /* Defined in include/asm-generic/vmlinux.lds.h */
-extern struct sched_class __begin_sched_classes[];
-extern struct sched_class __end_sched_classes[];
-
-#define sched_class_highest (__end_sched_classes - 1)
-#define sched_class_lowest  (__begin_sched_classes - 1)
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
 
 #define for_class_range(class, _from, _to) \
-	for (class = (_from); class != (_to); class--)
+	for (class = (_from); class < (_to); class++)
 
 #define for_each_class(class) \
-	for_class_range(class, sched_class_highest, sched_class_lowest)
+	for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b)	((_a) < (_b))
 
 extern const struct sched_class stop_sched_class;
 extern const struct sched_class dl_sched_class;
-- 
cgit v1.2.3


From 279b192c23d2fc9cb9e8c3851d6cad968f51789f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2022 16:11:23 -0500
Subject: blob_to_mnt(): kern_unmount() is needed to undo kern_mount()

plain mntput() won't do.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/usermode_driver.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 9dae1f648713..8303f4c7ca71 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -28,7 +28,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 
 	file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
 	if (IS_ERR(file)) {
-		mntput(mnt);
+		kern_unmount(mnt);
 		return ERR_CAST(file);
 	}
 
@@ -38,7 +38,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na
 		if (err >= 0)
 			err = -ENOMEM;
 		filp_close(file, NULL);
-		mntput(mnt);
+		kern_unmount(mnt);
 		return ERR_PTR(err);
 	}
 
-- 
cgit v1.2.3


From b154a017c92011d8f71ce804583e5f9c3d90bb9a Mon Sep 17 00:00:00 2001
From: Shida Zhang <starzhangzsd@gmail.com>
Date: Wed, 18 May 2022 09:36:47 +0800
Subject: cgroup: remove the superfluous judgment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the superfluous judgment since the function is
never called for a root cgroup, as suggested by Tejun.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shida Zhang <zhangshida@kylinos.cn>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a97fd051430b..1779ccddb734 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5685,7 +5685,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	css_clear_dir(&cgrp->self);
 	kernfs_remove(cgrp->kn);
 
-	if (parent && cgroup_is_threaded(cgrp))
+	if (cgroup_is_threaded(cgrp))
 		parent->nr_threaded_children--;
 
 	spin_lock_irq(&css_set_lock);
-- 
cgit v1.2.3


From 3ac6487e584a1eb54071dbe1212e05b884136704 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 May 2022 20:38:06 +0200
Subject: perf: Fix sys_perf_event_open() race against self

Norbert reported that it's possible to race sys_perf_event_open() such
that the looser ends up in another context from the group leader,
triggering many WARNs.

The move_group case checks for races against itself, but the
!move_group case doesn't, seemingly relying on the previous
group_leader->ctx == ctx check. However, that check is racy due to not
holding any locks at that time.

Therefore, re-check the result after acquiring locks and bailing
if they no longer match.

Additionally, clarify the not_move_group case from the
move_group-vs-move_group race.

Fixes: f63a8daa5812 ("perf: Fix event->ctx locking")
Reported-by: Norbert Slusarek <nslusarek@gmx.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/events/core.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7858bafffa9d..7f1e4c5897e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12217,6 +12217,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * Do not allow to attach to a group in a different task
 		 * or CPU context. If we're moving SW events, we'll fix
 		 * this up later, so allow that.
+		 *
+		 * Racy, not holding group_leader->ctx->mutex, see comment with
+		 * perf_event_ctx_lock().
 		 */
 		if (!move_group && group_leader->ctx != ctx)
 			goto err_context;
@@ -12282,6 +12285,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			} else {
 				perf_event_ctx_unlock(group_leader, gctx);
 				move_group = 0;
+				goto not_move_group;
 			}
 		}
 
@@ -12298,7 +12302,17 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	} else {
 		mutex_lock(&ctx->mutex);
+
+		/*
+		 * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
+		 * see the group_leader && !move_group test earlier.
+		 */
+		if (group_leader && group_leader->ctx != ctx) {
+			err = -EINVAL;
+			goto err_locked;
+		}
 	}
+not_move_group:
 
 	if (ctx->task == TASK_TOMBSTONE) {
 		err = -ESRCH;
-- 
cgit v1.2.3


From 3bc253c2e652cf5f12cd8c00d80d8ec55d67d1a7 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliang.tang@suse.com>
Date: Thu, 19 May 2022 16:30:10 -0700
Subject: bpf: Add bpf_skc_to_mptcp_sock_proto

This patch implements a new struct bpf_func_proto, named
bpf_skc_to_mptcp_sock_proto. Define a new bpf_id BTF_SOCK_TYPE_MPTCP,
and a new helper bpf_skc_to_mptcp_sock(), which invokes another new
helper bpf_mptcp_sock_from_subflow() in net/mptcp/bpf.c to get struct
mptcp_sock from a given subflow socket.

v2: Emit BTF type, add func_id checks in verifier.c and bpf_trace.c,
remove build check for CONFIG_BPF_JIT
v5: Drop EXPORT_SYMBOL (Martin)

Co-developed-by: Nicolas Rybowski <nicolas.rybowski@tessares.net>
Co-developed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Nicolas Rybowski <nicolas.rybowski@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220519233016.105670-2-mathew.j.martineau@linux.intel.com
---
 kernel/bpf/verifier.c    | 1 +
 kernel/trace/bpf_trace.c | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9b59581026f8..14e8c17d3d8d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -509,6 +509,7 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id)
 		func_id == BPF_FUNC_skc_to_tcp_sock ||
 		func_id == BPF_FUNC_skc_to_tcp6_sock ||
 		func_id == BPF_FUNC_skc_to_udp6_sock ||
+		func_id == BPF_FUNC_skc_to_mptcp_sock ||
 		func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
 		func_id == BPF_FUNC_skc_to_tcp_request_sock;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7141ca8a1c2d..10b157a6d73e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1705,6 +1705,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_to_udp6_sock_proto;
 	case BPF_FUNC_skc_to_unix_sock:
 		return &bpf_skc_to_unix_sock_proto;
+	case BPF_FUNC_skc_to_mptcp_sock:
+		return &bpf_skc_to_mptcp_sock_proto;
 	case BPF_FUNC_sk_storage_get:
 		return &bpf_sk_storage_get_tracing_proto;
 	case BPF_FUNC_sk_storage_delete:
-- 
cgit v1.2.3


From 979497674e63666a99fd7d242dba53a5ca5d628b Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Wed, 18 May 2022 22:59:08 +0200
Subject: bpf: Allow kfunc in tracing and syscall programs.

Tracing and syscall BPF program types are very convenient to add BPF
capabilities to subsystem otherwise not BPF capable.
When we add kfuncs capabilities to those program types, we can add
BPF features to subsystems without having to touch BPF core.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20220518205924.399291-2-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2f0b0440131c..7bccaa4646e5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -202,6 +202,8 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_XDP,
 	BTF_KFUNC_HOOK_TC,
 	BTF_KFUNC_HOOK_STRUCT_OPS,
+	BTF_KFUNC_HOOK_TRACING,
+	BTF_KFUNC_HOOK_SYSCALL,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -7110,6 +7112,10 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_TC;
 	case BPF_PROG_TYPE_STRUCT_OPS:
 		return BTF_KFUNC_HOOK_STRUCT_OPS;
+	case BPF_PROG_TYPE_TRACING:
+		return BTF_KFUNC_HOOK_TRACING;
+	case BPF_PROG_TYPE_SYSCALL:
+		return BTF_KFUNC_HOOK_SYSCALL;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
-- 
cgit v1.2.3


From c8644cd0efe719608ddcb341bcf087d4bc0bf6b8 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 19 May 2022 15:25:33 +0100
Subject: bpf: refine kernel.unprivileged_bpf_disabled behaviour

With unprivileged BPF disabled, all cmds associated with the BPF syscall
are blocked to users without CAP_BPF/CAP_SYS_ADMIN.  However there are
use cases where we may wish to allow interactions with BPF programs
without being able to load and attach them.  So for example, a process
with required capabilities loads/attaches a BPF program, and a process
with less capabilities interacts with it; retrieving perf/ring buffer
events, modifying map-specified config etc.  With all BPF syscall
commands blocked as a result of unprivileged BPF being disabled,
this mode of interaction becomes impossible for processes without
CAP_BPF.

As Alexei notes

"The bpf ACL model is the same as traditional file's ACL.
The creds and ACLs are checked at open().  Then during file's write/read
additional checks might be performed. BPF has such functionality already.
Different map_creates have capability checks while map_lookup has:
map_get_sys_perms(map, f) & FMODE_CAN_READ.
In other words it's enough to gate FD-receiving parts of bpf
with unprivileged_bpf_disabled sysctl.
The rest is handled by availability of FD and access to files in bpffs."

So key fd creation syscall commands BPF_PROG_LOAD and BPF_MAP_CREATE
are blocked with unprivileged BPF disabled and no CAP_BPF.

And as Alexei notes, map creation with unprivileged BPF disabled off
blocks creation of maps aside from array, hash and ringbuf maps.

Programs responsible for loading and attaching the BPF program
can still control access to its pinned representation by restricting
permissions on the pin path, as with normal files.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/1652970334-30510-2-git-send-email-alan.maguire@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 72e53489165d..2b69306d3c6e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4863,9 +4863,21 @@ out_prog_put:
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
+	bool capable;
 	int err;
 
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+	capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
+
+	/* Intent here is for unprivileged_bpf_disabled to block key object
+	 * creation commands for unprivileged users; other actions depend
+	 * of fd availability and access to bpffs, so are dependent on
+	 * object creation success.  Capabilities are later verified for
+	 * operations such as load and map create, so even with unprivileged
+	 * BPF disabled, capability checks are still carried out for these
+	 * and other operations.
+	 */
+	if (!capable &&
+	    (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
 		return -EPERM;
 
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
-- 
cgit v1.2.3


From 4a37f3dd9a83186cb88d44808ab35b78375082c9 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 20 May 2022 18:10:13 +0100
Subject: dma-direct: don't over-decrypt memory

The original x86 sev_alloc() only called set_memory_decrypted() on
memory returned by alloc_pages_node(), so the page order calculation
fell out of that logic. However, the common dma-direct code has several
potential allocators, not all of which are guaranteed to round up the
underlying allocation to a power-of-two size, so carrying over that
calculation for the encryption/decryption size was a mistake. Fix it by
rounding to a *number* of pages, rather than an order.

Until recently there was an even worse interaction with DMA_DIRECT_REMAP
where we could have ended up decrypting part of the next adjacent
vmalloc area, only averted by no architecture actually supporting both
configs at once. Don't ask how I found that one out...

Fixes: c10f07aa27da ("dma/direct: Handle force decryption for DMA coherent buffers in common code")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Rientjes <rientjes@google.com>
---
 kernel/dma/direct.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 3e7f4aab740e..e978f36e6be8 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -79,7 +79,7 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
 {
 	if (!force_dma_unencrypted(dev))
 		return 0;
-	return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
+	return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
 }
 
 static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
@@ -88,7 +88,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
 
 	if (!force_dma_unencrypted(dev))
 		return 0;
-	ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
+	ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
 	if (ret)
 		pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
 	return ret;
-- 
cgit v1.2.3


From d88bb5eed04ce50cc20e7f9282977841728be798 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 20 May 2022 16:57:51 -0700
Subject: bpf: Fill new bpf_prog_pack with illegal instructions

bpf_prog_pack enables sharing huge pages among multiple BPF programs.
These pages are marked as executable before the JIT engine fill it with
BPF programs. To make these pages safe, fill the hole bpf_prog_pack with
illegal instructions before making it executable.

Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220520235758.1858153-2-song@kernel.org
---
 kernel/bpf/core.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9cc91f0f3115..2d0c9d4696ad 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -873,7 +873,7 @@ static size_t select_bpf_prog_pack_size(void)
 	return size;
 }
 
-static struct bpf_prog_pack *alloc_new_pack(void)
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_prog_pack *pack;
 
@@ -886,6 +886,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
 		kfree(pack);
 		return NULL;
 	}
+	bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
 	bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
 	list_add_tail(&pack->list, &pack_list);
 
@@ -895,7 +896,7 @@ static struct bpf_prog_pack *alloc_new_pack(void)
 	return pack;
 }
 
-static void *bpf_prog_pack_alloc(u32 size)
+static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
 	struct bpf_prog_pack *pack;
@@ -910,6 +911,7 @@ static void *bpf_prog_pack_alloc(u32 size)
 		size = round_up(size, PAGE_SIZE);
 		ptr = module_alloc(size);
 		if (ptr) {
+			bpf_fill_ill_insns(ptr, size);
 			set_vm_flush_reset_perms(ptr);
 			set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
 			set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
@@ -923,7 +925,7 @@ static void *bpf_prog_pack_alloc(u32 size)
 			goto found_free_area;
 	}
 
-	pack = alloc_new_pack();
+	pack = alloc_new_pack(bpf_fill_ill_insns);
 	if (!pack)
 		goto out;
 
@@ -1102,7 +1104,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
-	ro_header = bpf_prog_pack_alloc(size);
+	ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
 	if (!ro_header) {
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
-- 
cgit v1.2.3


From fe736565efb775620dbcf3c459c1cd80d3e868da Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 20 May 2022 16:57:53 -0700
Subject: bpf: Introduce bpf_arch_text_invalidate for bpf_prog_pack

Introduce bpf_arch_text_invalidate and use it to fill unused part of the
bpf_prog_pack with illegal instructions when a BPF program is freed.

Fixes: 57631054fae6 ("bpf: Introduce bpf_prog_pack allocator")
Fixes: 33c9805860e5 ("bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free]")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220520235758.1858153-4-song@kernel.org
---
 kernel/bpf/core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2d0c9d4696ad..cacd8684c3c4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -968,6 +968,9 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
 	pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
 
+	WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+		  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
 	bitmap_clear(pack->bitmap, pos, nbits);
 	if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
 				       bpf_prog_chunk_count(), 0) == 0) {
@@ -2740,6 +2743,11 @@ void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
 	return ERR_PTR(-ENOTSUPP);
 }
 
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
+{
+	return -ENOTSUPP;
+}
+
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
-- 
cgit v1.2.3


From 1ec5ee8c8a5a65ea377f8bea64bf4d5b743f6f79 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 21 May 2022 18:56:20 +0530
Subject: bpf: Suppress 'passing zero to PTR_ERR' warning

Kernel Test Robot complains about passing zero to PTR_ERR for the said
line, suppress it by using PTR_ERR_OR_ZERO.

Fixes: c0a5a21c25f3 ("bpf: Allow storing referenced kptr in map")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220521132620.1976921-1-memxor@gmail.com
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14e8c17d3d8d..45153cbc2bd6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5352,7 +5352,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 		return -EINVAL;
 	}
 	if (!map_value_has_kptrs(map_ptr)) {
-		ret = PTR_ERR(map_ptr->kptr_off_tab);
+		ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab);
 		if (ret == -E2BIG)
 			verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name,
 				BPF_MAP_VALUE_OFF_MAX);
-- 
cgit v1.2.3


From 97e03f521050c092919591e668107b3d69c5f426 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 23 May 2022 14:07:07 -0700
Subject: bpf: Add verifier support for dynptrs

This patch adds the bulk of the verifier work for supporting dynamic
pointers (dynptrs) in bpf.

A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure
defined internally as:

struct bpf_dynptr_kern {
    void *data;
    u32 size;
    u32 offset;
} __aligned(8);

The upper 8 bits of *size* is reserved (it contains extra metadata about
read-only status and dynptr type). Consequently, a dynptr only supports
memory less than 16 MB.

There are different types of dynptrs (eg malloc, ringbuf, ...). In this
patchset, the most basic one, dynptrs to a bpf program's local memory,
is added. For now only local memory that is of reg type PTR_TO_MAP_VALUE
is supported.

In the verifier, dynptr state information will be tracked in stack
slots. When the program passes in an uninitialized dynptr
(ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding
to the frame pointer where the dynptr resides at are marked
STACK_DYNPTR. For helper functions that take in initialized dynptrs (eg
bpf_dynptr_read + bpf_dynptr_write which are added later in this
patchset), the verifier enforces that the dynptr has been initialized
properly by checking that their corresponding stack slots have been
marked as STACK_DYNPTR.

The 6th patch in this patchset adds test cases that the verifier should
successfully reject, such as for example attempting to use a dynptr
after doing a direct write into it inside the bpf program.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20220523210712.3641569-2-joannelkoong@gmail.com
---
 kernel/bpf/verifier.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 185 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45153cbc2bd6..1fd0b81c3fec 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -259,6 +259,7 @@ struct bpf_call_arg_meta {
 	u32 ret_btf_id;
 	u32 subprogno;
 	struct bpf_map_value_off_desc *kptr_off_desc;
+	u8 uninit_dynptr_regno;
 };
 
 struct btf *btf_vmlinux;
@@ -581,6 +582,7 @@ static char slot_type_char[] = {
 	[STACK_SPILL]	= 'r',
 	[STACK_MISC]	= 'm',
 	[STACK_ZERO]	= '0',
+	[STACK_DYNPTR]	= 'd',
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -596,6 +598,25 @@ static void print_liveness(struct bpf_verifier_env *env,
 		verbose(env, "D");
 }
 
+static int get_spi(s32 off)
+{
+	return (-off - 1) / BPF_REG_SIZE;
+}
+
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
+{
+	int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+	/* We need to check that slots between [spi - nr_slots + 1, spi] are
+	 * within [0, allocated_stack).
+	 *
+	 * Please note that the spi grows downwards. For example, a dynptr
+	 * takes the size of two stack slots; the first slot will be at
+	 * spi and the second slot will be at spi - 1.
+	 */
+	return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+}
+
 static struct bpf_func_state *func(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *reg)
 {
@@ -647,6 +668,108 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
 	env->scratched_stack_slots = ~0ULL;
 }
 
+static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
+{
+	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+	case DYNPTR_TYPE_LOCAL:
+		return BPF_DYNPTR_TYPE_LOCAL;
+	default:
+		return BPF_DYNPTR_TYPE_INVALID;
+	}
+}
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				   enum bpf_arg_type arg_type, int insn_idx)
+{
+	struct bpf_func_state *state = func(env, reg);
+	enum bpf_dynptr_type type;
+	int spi, i;
+
+	spi = get_spi(reg->off);
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+		return -EINVAL;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		state->stack[spi].slot_type[i] = STACK_DYNPTR;
+		state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+	}
+
+	type = arg_to_dynptr_type(arg_type);
+	if (type == BPF_DYNPTR_TYPE_INVALID)
+		return -EINVAL;
+
+	state->stack[spi].spilled_ptr.dynptr.first_slot = true;
+	state->stack[spi].spilled_ptr.dynptr.type = type;
+	state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+
+	return 0;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i;
+
+	spi = get_spi(reg->off);
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+		return -EINVAL;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		state->stack[spi].slot_type[i] = STACK_INVALID;
+		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+	}
+
+	state->stack[spi].spilled_ptr.dynptr.first_slot = false;
+	state->stack[spi].spilled_ptr.dynptr.type = 0;
+	state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+
+	return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi = get_spi(reg->off);
+	int i;
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+		return true;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		if (state->stack[spi].slot_type[i] == STACK_DYNPTR ||
+		    state->stack[spi - 1].slot_type[i] == STACK_DYNPTR)
+			return false;
+	}
+
+	return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				     enum bpf_arg_type arg_type)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi = get_spi(reg->off);
+	int i;
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+	    !state->stack[spi].spilled_ptr.dynptr.first_slot)
+		return false;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+		    state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+			return false;
+	}
+
+	/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+	if (arg_type == ARG_PTR_TO_DYNPTR)
+		return true;
+
+	return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -5400,6 +5523,11 @@ static bool arg_type_is_release(enum bpf_arg_type type)
 	return type & OBJ_RELEASE;
 }
 
+static bool arg_type_is_dynptr(enum bpf_arg_type type)
+{
+	return base_type(type) == ARG_PTR_TO_DYNPTR;
+}
+
 static int int_ptr_type_to_size(enum bpf_arg_type type)
 {
 	if (type == ARG_PTR_TO_INT)
@@ -5539,6 +5667,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
 	[ARG_PTR_TO_TIMER]		= &timer_types,
 	[ARG_PTR_TO_KPTR]		= &kptr_types,
+	[ARG_PTR_TO_DYNPTR]		= &stack_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5628,8 +5757,13 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	bool fixed_off_ok = false;
 
 	switch ((u32)type) {
-	case SCALAR_VALUE:
 	/* Pointer types where reg offset is explicitly allowed: */
+	case PTR_TO_STACK:
+		if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) {
+			verbose(env, "cannot pass in dynptr at an offset\n");
+			return -EINVAL;
+		}
+		fallthrough;
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
 	case PTR_TO_MAP_KEY:
@@ -5639,7 +5773,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	case PTR_TO_MEM | MEM_ALLOC:
 	case PTR_TO_BUF:
 	case PTR_TO_BUF | MEM_RDONLY:
-	case PTR_TO_STACK:
+	case SCALAR_VALUE:
 		/* Some of the argument types nevertheless require a
 		 * zero register offset.
 		 */
@@ -5837,6 +5971,36 @@ skip_type_check:
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
 		err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
+	} else if (arg_type_is_dynptr(arg_type)) {
+		if (arg_type & MEM_UNINIT) {
+			if (!is_dynptr_reg_valid_uninit(env, reg)) {
+				verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+				return -EINVAL;
+			}
+
+			/* We only support one dynptr being uninitialized at the moment,
+			 * which is sufficient for the helper functions we have right now.
+			 */
+			if (meta->uninit_dynptr_regno) {
+				verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
+				return -EFAULT;
+			}
+
+			meta->uninit_dynptr_regno = regno;
+		} else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
+			const char *err_extra = "";
+
+			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+			case DYNPTR_TYPE_LOCAL:
+				err_extra = "local ";
+				break;
+			default:
+				break;
+			}
+			verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
+				err_extra, arg + 1);
+			return -EINVAL;
+		}
 	} else if (arg_type_is_alloc_size(arg_type)) {
 		if (!tnum_is_const(reg->var_off)) {
 			verbose(env, "R%d is not a known constant'\n",
@@ -6970,9 +7134,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	regs = cur_regs(env);
 
+	if (meta.uninit_dynptr_regno) {
+		/* we write BPF_DW bits (8 bytes) at a time */
+		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+			err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
+					       i, BPF_DW, BPF_WRITE, -1, false);
+			if (err)
+				return err;
+		}
+
+		err = mark_stack_slots_dynptr(env, &regs[meta.uninit_dynptr_regno],
+					      fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1],
+					      insn_idx);
+		if (err)
+			return err;
+	}
+
 	if (meta.release_regno) {
 		err = -EINVAL;
-		if (meta.ref_obj_id)
+		if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+		else if (meta.ref_obj_id)
 			err = release_reference(env, meta.ref_obj_id);
 		/* meta.ref_obj_id can only be 0 if register that is meant to be
 		 * released is NULL, which must be > R0.
-- 
cgit v1.2.3


From 263ae152e96253f40c2c276faad8629e096b3bad Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 23 May 2022 14:07:08 -0700
Subject: bpf: Add bpf_dynptr_from_mem for local dynptrs

This patch adds a new api bpf_dynptr_from_mem:

long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr);

which initializes a dynptr to point to a bpf program's local memory. For now
only local memory that is of reg type PTR_TO_MAP_VALUE is supported.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220523210712.3641569-3-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c  | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c |  6 +++++
 2 files changed, 71 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bad96131a510..d3e935c2e25e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1412,6 +1412,69 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = {
 	.arg2_btf_id  = BPF_PTR_POISON,
 };
 
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
+#define DYNPTR_TYPE_SHIFT	28
+
+static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+	ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static int bpf_dynptr_check_size(u32 size)
+{
+	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+			    enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+	ptr->data = data;
+	ptr->offset = offset;
+	ptr->size = size;
+	bpf_dynptr_set_type(ptr, type);
+}
+
+static void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+	memset(ptr, 0, sizeof(*ptr));
+}
+
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+	int err;
+
+	err = bpf_dynptr_check_size(size);
+	if (err)
+		goto error;
+
+	/* flags is currently unsupported */
+	if (flags) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+
+	return 0;
+
+error:
+	bpf_dynptr_set_null(ptr);
+	return err;
+}
+
+const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+	.func		= bpf_dynptr_from_mem,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1466,6 +1529,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_loop_proto;
 	case BPF_FUNC_strncmp:
 		return &bpf_strncmp_proto;
+	case BPF_FUNC_dynptr_from_mem:
+		return &bpf_dynptr_from_mem_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1fd0b81c3fec..b657d46f886e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7204,6 +7204,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_loop_callback_state);
 		break;
+	case BPF_FUNC_dynptr_from_mem:
+		if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+			verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
+				reg_type_str(env, regs[BPF_REG_1].type));
+			return -EACCES;
+		}
 	}
 
 	if (err)
-- 
cgit v1.2.3


From bc34dee65a65e9c920c420005b8a43f2a721a458 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 23 May 2022 14:07:09 -0700
Subject: bpf: Dynptr support for ring buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, our only way of writing dynamically-sized data into a ring
buffer is through bpf_ringbuf_output but this incurs an extra memcpy
cost. bpf_ringbuf_reserve + bpf_ringbuf_commit avoids this extra
memcpy, but it can only safely support reservation sizes that are
statically known since the verifier cannot guarantee that the bpf
program won’t access memory outside the reserved space.

The bpf_dynptr abstraction allows for dynamically-sized ring buffer
reservations without the extra memcpy.

There are 3 new APIs:

long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr);
void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags);
void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags);

These closely follow the functionalities of the original ringbuf APIs.
For example, all ringbuffer dynptrs that have been reserved must be
either submitted or discarded before the program exits.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20220523210712.3641569-4-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c  | 14 ++++++---
 kernel/bpf/ringbuf.c  | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++++++--
 3 files changed, 137 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d3e935c2e25e..abb08999ff56 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1423,13 +1423,13 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
-static int bpf_dynptr_check_size(u32 size)
+int bpf_dynptr_check_size(u32 size)
 {
 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
 }
 
-static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
-			    enum bpf_dynptr_type type, u32 offset, u32 size)
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+		     enum bpf_dynptr_type type, u32 offset, u32 size)
 {
 	ptr->data = data;
 	ptr->offset = offset;
@@ -1437,7 +1437,7 @@ static void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 	bpf_dynptr_set_type(ptr, type);
 }
 
-static void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
 {
 	memset(ptr, 0, sizeof(*ptr));
 }
@@ -1523,6 +1523,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_discard_proto;
 	case BPF_FUNC_ringbuf_query:
 		return &bpf_ringbuf_query_proto;
+	case BPF_FUNC_ringbuf_reserve_dynptr:
+		return &bpf_ringbuf_reserve_dynptr_proto;
+	case BPF_FUNC_ringbuf_submit_dynptr:
+		return &bpf_ringbuf_submit_dynptr_proto;
+	case BPF_FUNC_ringbuf_discard_dynptr:
+		return &bpf_ringbuf_discard_dynptr_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
 	case BPF_FUNC_loop:
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 311264ab80c4..ded4faeca192 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -475,3 +475,81 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = {
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
+	   struct bpf_dynptr_kern *, ptr)
+{
+	struct bpf_ringbuf_map *rb_map;
+	void *sample;
+	int err;
+
+	if (unlikely(flags)) {
+		bpf_dynptr_set_null(ptr);
+		return -EINVAL;
+	}
+
+	err = bpf_dynptr_check_size(size);
+	if (err) {
+		bpf_dynptr_set_null(ptr);
+		return err;
+	}
+
+	rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
+	if (!sample) {
+		bpf_dynptr_set_null(ptr);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
+	.func		= bpf_ringbuf_reserve_dynptr,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
+};
+
+BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+	if (!ptr->data)
+		return 0;
+
+	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);
+
+	bpf_dynptr_set_null(ptr);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
+	.func		= bpf_ringbuf_submit_dynptr,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+	if (!ptr->data)
+		return 0;
+
+	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);
+
+	bpf_dynptr_set_null(ptr);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
+	.func		= bpf_ringbuf_discard_dynptr,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+	.arg2_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b657d46f886e..8be140351966 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -187,6 +187,9 @@ struct bpf_verifier_stack_elem {
 					  POISON_POINTER_DELTA))
 #define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
 
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
 	return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
@@ -673,17 +676,24 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
 	case DYNPTR_TYPE_LOCAL:
 		return BPF_DYNPTR_TYPE_LOCAL;
+	case DYNPTR_TYPE_RINGBUF:
+		return BPF_DYNPTR_TYPE_RINGBUF;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
 }
 
+static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+{
+	return type == BPF_DYNPTR_TYPE_RINGBUF;
+}
+
 static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				   enum bpf_arg_type arg_type, int insn_idx)
 {
 	struct bpf_func_state *state = func(env, reg);
 	enum bpf_dynptr_type type;
-	int spi, i;
+	int spi, i, id;
 
 	spi = get_spi(reg->off);
 
@@ -703,6 +713,16 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
 	state->stack[spi].spilled_ptr.dynptr.type = type;
 	state->stack[spi - 1].spilled_ptr.dynptr.type = type;
 
+	if (dynptr_type_refcounted(type)) {
+		/* The id is used to track proper releasing */
+		id = acquire_reference_state(env, insn_idx);
+		if (id < 0)
+			return id;
+
+		state->stack[spi].spilled_ptr.id = id;
+		state->stack[spi - 1].spilled_ptr.id = id;
+	}
+
 	return 0;
 }
 
@@ -721,6 +741,13 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
 		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
 	}
 
+	/* Invalidate any slices associated with this dynptr */
+	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+		release_reference(env, state->stack[spi].spilled_ptr.id);
+		state->stack[spi].spilled_ptr.id = 0;
+		state->stack[spi - 1].spilled_ptr.id = 0;
+	}
+
 	state->stack[spi].spilled_ptr.dynptr.first_slot = false;
 	state->stack[spi].spilled_ptr.dynptr.type = 0;
 	state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
@@ -5859,7 +5886,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 
 skip_type_check:
 	if (arg_type_is_release(arg_type)) {
-		if (!reg->ref_obj_id && !register_is_null(reg)) {
+		if (arg_type_is_dynptr(arg_type)) {
+			struct bpf_func_state *state = func(env, reg);
+			int spi = get_spi(reg->off);
+
+			if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+			    !state->stack[spi].spilled_ptr.id) {
+				verbose(env, "arg %d is an unacquired reference\n", regno);
+				return -EINVAL;
+			}
+		} else if (!reg->ref_obj_id && !register_is_null(reg)) {
 			verbose(env, "R%d must be referenced when passed to release function\n",
 				regno);
 			return -EINVAL;
@@ -5994,9 +6030,13 @@ skip_type_check:
 			case DYNPTR_TYPE_LOCAL:
 				err_extra = "local ";
 				break;
+			case DYNPTR_TYPE_RINGBUF:
+				err_extra = "ringbuf ";
+				break;
 			default:
 				break;
 			}
+
 			verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
 				err_extra, arg + 1);
 			return -EINVAL;
@@ -6122,7 +6162,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_MAP_TYPE_RINGBUF:
 		if (func_id != BPF_FUNC_ringbuf_output &&
 		    func_id != BPF_FUNC_ringbuf_reserve &&
-		    func_id != BPF_FUNC_ringbuf_query)
+		    func_id != BPF_FUNC_ringbuf_query &&
+		    func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
+		    func_id != BPF_FUNC_ringbuf_submit_dynptr &&
+		    func_id != BPF_FUNC_ringbuf_discard_dynptr)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -6238,6 +6281,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_ringbuf_output:
 	case BPF_FUNC_ringbuf_reserve:
 	case BPF_FUNC_ringbuf_query:
+	case BPF_FUNC_ringbuf_reserve_dynptr:
+	case BPF_FUNC_ringbuf_submit_dynptr:
+	case BPF_FUNC_ringbuf_discard_dynptr:
 		if (map->map_type != BPF_MAP_TYPE_RINGBUF)
 			goto error;
 		break;
-- 
cgit v1.2.3


From 13bbbfbea7598ea9f8d9c3d73bf053bb57f9c4b2 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 23 May 2022 14:07:10 -0700
Subject: bpf: Add bpf_dynptr_read and bpf_dynptr_write

This patch adds two helper functions, bpf_dynptr_read and
bpf_dynptr_write:

long bpf_dynptr_read(void *dst, u32 len, struct bpf_dynptr *src, u32 offset);

long bpf_dynptr_write(struct bpf_dynptr *dst, u32 offset, void *src, u32 len);

The dynptr passed into these functions must be valid dynptrs that have
been initialized.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220523210712.3641569-5-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index abb08999ff56..8cef3fb0d143 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1417,12 +1417,24 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = {
  */
 #define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
 #define DYNPTR_TYPE_SHIFT	28
+#define DYNPTR_SIZE_MASK	0xFFFFFF
+#define DYNPTR_RDONLY_BIT	BIT(31)
+
+static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+{
+	return ptr->size & DYNPTR_RDONLY_BIT;
+}
 
 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
 {
 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
+static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+{
+	return ptr->size & DYNPTR_SIZE_MASK;
+}
+
 int bpf_dynptr_check_size(u32 size)
 {
 	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
@@ -1442,6 +1454,16 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
 	memset(ptr, 0, sizeof(*ptr));
 }
 
+static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+{
+	u32 size = bpf_dynptr_get_size(ptr);
+
+	if (len > size || offset > size - len)
+		return -E2BIG;
+
+	return 0;
+}
+
 BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
 {
 	int err;
@@ -1475,6 +1497,58 @@ const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
 };
 
+BPF_CALL_4(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, u32, offset)
+{
+	int err;
+
+	if (!src->data)
+		return -EINVAL;
+
+	err = bpf_dynptr_check_off_len(src, offset, len);
+	if (err)
+		return err;
+
+	memcpy(dst, src->data + src->offset + offset, len);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_dynptr_read_proto = {
+	.func		= bpf_dynptr_read,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_DYNPTR,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len)
+{
+	int err;
+
+	if (!dst->data || bpf_dynptr_is_rdonly(dst))
+		return -EINVAL;
+
+	err = bpf_dynptr_check_off_len(dst, offset, len);
+	if (err)
+		return err;
+
+	memcpy(dst->data + dst->offset + offset, src, len);
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_dynptr_write_proto = {
+	.func		= bpf_dynptr_write,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_DYNPTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
+	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1537,6 +1611,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_strncmp_proto;
 	case BPF_FUNC_dynptr_from_mem:
 		return &bpf_dynptr_from_mem_proto;
+	case BPF_FUNC_dynptr_read:
+		return &bpf_dynptr_read_proto;
+	case BPF_FUNC_dynptr_write:
+		return &bpf_dynptr_write_proto;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 34d4ef5775f776ec4b0d53a02d588bf3195cada6 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Mon, 23 May 2022 14:07:11 -0700
Subject: bpf: Add dynptr data slices

This patch adds a new helper function

void *bpf_dynptr_data(struct bpf_dynptr *ptr, u32 offset, u32 len);

which returns a pointer to the underlying data of a dynptr. *len*
must be a statically known value. The bpf program may access the returned
data slice as a normal buffer (eg can do direct reads and writes), since
the verifier associates the length with the returned pointer, and
enforces that no out of bounds accesses occur.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220523210712.3641569-6-joannelkoong@gmail.com
---
 kernel/bpf/helpers.c  | 28 ++++++++++++++++++++++++++++
 kernel/bpf/verifier.c | 23 +++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8cef3fb0d143..225806a02efb 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1549,6 +1549,32 @@ const struct bpf_func_proto bpf_dynptr_write_proto = {
 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+{
+	int err;
+
+	if (!ptr->data)
+		return 0;
+
+	err = bpf_dynptr_check_off_len(ptr, offset, len);
+	if (err)
+		return 0;
+
+	if (bpf_dynptr_is_rdonly(ptr))
+		return 0;
+
+	return (unsigned long)(ptr->data + ptr->offset + offset);
+}
+
+const struct bpf_func_proto bpf_dynptr_data_proto = {
+	.func		= bpf_dynptr_data,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_DYNPTR_MEM_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_DYNPTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_CONST_ALLOC_SIZE_OR_ZERO,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1615,6 +1641,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_dynptr_read_proto;
 	case BPF_FUNC_dynptr_write:
 		return &bpf_dynptr_write_proto;
+	case BPF_FUNC_dynptr_data:
+		return &bpf_dynptr_data_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8be140351966..aedac2ac02b9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5832,6 +5832,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 	return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
 }
 
+static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi = get_spi(reg->off);
+
+	return state->stack[spi].spilled_ptr.id;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
 			  const struct bpf_func_proto *fn)
@@ -7384,6 +7392,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		regs[BPF_REG_0].id = id;
 		/* For release_reference() */
 		regs[BPF_REG_0].ref_obj_id = id;
+	} else if (func_id == BPF_FUNC_dynptr_data) {
+		int dynptr_id = 0, i;
+
+		/* Find the id of the dynptr we're acquiring a reference to */
+		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+			if (arg_type_is_dynptr(fn->arg_type[i])) {
+				if (dynptr_id) {
+					verbose(env, "verifier internal error: multiple dynptr args in func\n");
+					return -EFAULT;
+				}
+				dynptr_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]);
+			}
+		}
+		/* For release_reference() */
+		regs[BPF_REG_0].ref_obj_id = dynptr_id;
 	}
 
 	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
-- 
cgit v1.2.3


From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 28 Mar 2022 08:26:48 +0200
Subject: livepatch: Remove klp_arch_set_pc() and asm/livepatch.h

All three versions of klp_arch_set_pc() do exactly the same: they
call ftrace_instruction_pointer_set().

Call ftrace_instruction_pointer_set() directly and remove
klp_arch_set_pc().

As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h
on x86 and s390, remove asm/livepatch.h

livepatch.h remains on powerpc but its content is exclusively used
by powerpc specific code.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Petr Mladek <pmladek@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/livepatch/patch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index c172bf92b576..4c4f5a776d80 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
 	if (func->nop)
 		goto unlock;
 
-	klp_arch_set_pc(fregs, (unsigned long)func->new_func);
+	ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func);
 
 unlock:
 	ftrace_test_recursion_unlock(bit);
-- 
cgit v1.2.3


From 7b4537199a4a8480b8c3ba37a2d44765ce76cd9b Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Fri, 13 May 2022 20:39:22 +0900
Subject: kbuild: link symbol CRCs at final link, removing
 CONFIG_MODULE_REL_CRCS

include/{linux,asm-generic}/export.h defines a weak symbol, __crc_*
as a placeholder.

Genksyms writes the version CRCs into the linker script, which will be
used for filling the __crc_* symbols. The linker script format depends
on CONFIG_MODULE_REL_CRCS. If it is enabled, __crc_* holds the offset
to the reference of CRC.

It is time to get rid of this complexity.

Now that modpost parses text files (.*.cmd) to collect all the CRCs,
it can generate C code that will be linked to the vmlinux or modules.

Generate a new C file, .vmlinux.export.c, which contains the CRCs of
symbols exported by vmlinux. It is compiled and linked to vmlinux in
scripts/link-vmlinux.sh.

Put the CRCs of symbols exported by modules into the existing *.mod.c
files. No additional build step is needed for modules. As before,
*.mod.c are compiled and linked to *.ko in scripts/Makefile.modfinal.

No linker magic is used here. The new C implementation works in the
same way, whether CONFIG_RELOCATABLE is enabled or not.
CONFIG_MODULE_REL_CRCS is no longer needed.

Previously, Kbuild invoked additional $(LD) to update the CRCs in
objects, but this step is unneeded too.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nicolas Schier <nicolas@fjasle.eu>
Reviewed-by: Nicolas Schier <nicolas@fjasle.eu>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM-14 (x86-64)
---
 kernel/module.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 6cea788fd965..c9e2342da28e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1231,11 +1231,6 @@ static int try_to_force_load(struct module *mod, const char *reason)
 
 #ifdef CONFIG_MODVERSIONS
 
-static u32 resolve_rel_crc(const s32 *crc)
-{
-	return *(u32 *)((void *)crc + *crc);
-}
-
 static int check_version(const struct load_info *info,
 			 const char *symname,
 			 struct module *mod,
@@ -1264,10 +1259,7 @@ static int check_version(const struct load_info *info,
 		if (strcmp(versions[i].name, symname) != 0)
 			continue;
 
-		if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
-			crcval = resolve_rel_crc(crc);
-		else
-			crcval = *crc;
+		crcval = *crc;
 		if (versions[i].crc == crcval)
 			return 1;
 		pr_debug("Found checksum %X vs module %lX\n",
-- 
cgit v1.2.3


From eadb2f47a3ced5c64b23b90fd2a3463f63726066 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Mon, 23 May 2022 19:11:02 +0100
Subject: lockdown: also lock down previous kgdb use

KGDB and KDB allow read and write access to kernel memory, and thus
should be restricted during lockdown.  An attacker with access to a
serial port (for example, via a hypervisor console, which some cloud
vendors provide over the network) could trigger the debugger so it is
important that the debugger respect the lockdown mode when/if it is
triggered.

Fix this by integrating lockdown into kdb's existing permissions
mechanism.  Unfortunately kgdb does not have any permissions mechanism
(although it certainly could be added later) so, for now, kgdb is simply
and brutally disabled by immediately exiting the gdb stub without taking
any action.

For lockdowns established early in the boot (e.g. the normal case) then
this should be fine but on systems where kgdb has set breakpoints before
the lockdown is enacted than "bad things" will happen.

CVE: CVE-2022-21499
Co-developed-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/debug/debug_core.c   | 24 ++++++++++++++++++
 kernel/debug/kdb/kdb_main.c | 62 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index da06a5553835..7beceb447211 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -53,6 +53,7 @@
 #include <linux/vmacache.h>
 #include <linux/rcupdate.h>
 #include <linux/irq.h>
+#include <linux/security.h>
 
 #include <asm/cacheflush.h>
 #include <asm/byteorder.h>
@@ -752,6 +753,29 @@ cpu_master_loop:
 				continue;
 			kgdb_connected = 0;
 		} else {
+			/*
+			 * This is a brutal way to interfere with the debugger
+			 * and prevent gdb being used to poke at kernel memory.
+			 * This could cause trouble if lockdown is applied when
+			 * there is already an active gdb session. For now the
+			 * answer is simply "don't do that". Typically lockdown
+			 * *will* be applied before the debug core gets started
+			 * so only developers using kgdb for fairly advanced
+			 * early kernel debug can be biten by this. Hopefully
+			 * they are sophisticated enough to take care of
+			 * themselves, especially with help from the lockdown
+			 * message printed on the console!
+			 */
+			if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+				if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+					/* Switch back to kdb if possible... */
+					dbg_kdb_mode = 1;
+					continue;
+				} else {
+					/* ... otherwise just bail */
+					break;
+				}
+			}
 			error = gdb_serial_stub(ks);
 		}
 
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0852a537dad4..ead4da947127 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -45,6 +45,7 @@
 #include <linux/proc_fs.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/security.h>
 #include "kdb_private.h"
 
 #undef	MODULE_PARAM_PREFIX
@@ -166,10 +167,62 @@ struct task_struct *kdb_curr_task(int cpu)
 }
 
 /*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
+ */
+static void kdb_check_for_lockdown(void)
+{
+	const int write_flags = KDB_ENABLE_MEM_WRITE |
+				KDB_ENABLE_REG_WRITE |
+				KDB_ENABLE_FLOW_CTRL;
+	const int read_flags = KDB_ENABLE_MEM_READ |
+			       KDB_ENABLE_REG_READ;
+
+	bool need_to_lockdown_write = false;
+	bool need_to_lockdown_read = false;
+
+	if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+		need_to_lockdown_write =
+			security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+	if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+		need_to_lockdown_read =
+			security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+	/* De-compose KDB_ENABLE_ALL if required */
+	if (need_to_lockdown_write || need_to_lockdown_read)
+		if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+			kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+	if (need_to_lockdown_write)
+		kdb_cmd_enabled &= ~write_flags;
+
+	if (need_to_lockdown_read)
+		kdb_cmd_enabled &= ~read_flags;
+}
+
+/*
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
  */
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
 				   bool no_args)
 {
 	/* permissions comes from userspace so needs massaging slightly */
@@ -1180,6 +1233,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 		kdb_curr_task(raw_smp_processor_id());
 
 	KDB_DEBUG_STATE("kdb_local 1", reason);
+
+	kdb_check_for_lockdown();
+
 	kdb_go_count = 0;
 	if (reason == KDB_REASON_DEBUG) {
 		/* special case below */
-- 
cgit v1.2.3


From da007f171fc9dd993d03a360b515cac1c87921bb Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Wed, 25 May 2022 00:21:18 +0300
Subject: kernel/reboot: Change registration order of legacy power-off handler

We're unconditionally registering sys-off handler for the legacy
pm_power_off() callback, this causes problem for platforms that don't
use power-off handlers at all and should be halted. Now reboot syscall
assumes that there is a power-off handler installed and tries to power
off system instead of halting it.

To fix the trouble, move the handler's registration to the reboot syscall
and check the pm_power_off() presence.

Fixes: 0e2110d2e910 ("kernel/reboot: Add kernel_can_power_off()")
Reported-by: Geert Uytterhoeven <geert+renesas@glider.be>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index bdb635842ec8..d4aa372c0cdc 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -569,22 +569,6 @@ static int legacy_pm_power_off(struct sys_off_data *data)
 	return NOTIFY_DONE;
 }
 
-/*
- * Register sys-off handlers for legacy PM callbacks. This allows legacy
- * PM callbacks co-exist with the new sys-off API.
- *
- * TODO: Remove legacy handlers once all legacy PM users will be switched
- *       to the sys-off based APIs.
- */
-static int __init legacy_pm_init(void)
-{
-	register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, SYS_OFF_PRIO_DEFAULT,
-				 legacy_pm_power_off, NULL);
-
-	return 0;
-}
-core_initcall(legacy_pm_init);
-
 static void do_kernel_power_off_prepare(void)
 {
 	blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
@@ -646,6 +630,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		void __user *, arg)
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(current);
+	struct sys_off_handler *sys_off = NULL;
 	char buffer[256];
 	int ret = 0;
 
@@ -670,6 +655,21 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	if (ret)
 		return ret;
 
+	/*
+	 * Register sys-off handlers for legacy PM callback. This allows
+	 * legacy PM callbacks temporary co-exist with the new sys-off API.
+	 *
+	 * TODO: Remove legacy handlers once all legacy PM users will be
+	 *       switched to the sys-off based APIs.
+	 */
+	if (pm_power_off) {
+		sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+						   SYS_OFF_PRIO_DEFAULT,
+						   legacy_pm_power_off, NULL);
+		if (IS_ERR(sys_off))
+			return PTR_ERR(sys_off);
+	}
+
 	/* Instead of trying to make the power_off code look like
 	 * halt when pm_power_off is not set do it the easy way.
 	 */
@@ -727,6 +727,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		break;
 	}
 	mutex_unlock(&system_transition_mutex);
+	unregister_sys_off_handler(sys_off);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 3159d79b56c15068aeb7e4630cd5f6dacd20fda4 Mon Sep 17 00:00:00 2001
From: Congyu Liu <liu3101@purdue.edu>
Date: Mon, 23 May 2022 05:35:31 +0000
Subject: kcov: update pos before writing pc in trace function

In __sanitizer_cov_trace_pc(), previously we write pc before updating pos.
However, some early interrupt code could bypass check_kcov_mode() check
and invoke __sanitizer_cov_trace_pc().  If such interrupt is raised
between writing pc and updating pos, the pc could be overitten by the
recursive __sanitizer_cov_trace_pc().

As suggested by Dmitry, we cold update pos before writing pc to avoid such
interleaving.

Apply the same change to write_comp_data().

Link: https://lkml.kernel.org/r/20220523053531.1572793-1-liu3101@purdue.edu
Signed-off-by: Congyu Liu <liu3101@purdue.edu>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kcov.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index b3732b210593..e19c84b02452 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void)
 	/* The first 64-bit word is the number of subsequent PCs. */
 	pos = READ_ONCE(area[0]) + 1;
 	if (likely(pos < t->kcov_size)) {
-		area[pos] = ip;
+		/* Previously we write pc before updating pos. However, some
+		 * early interrupt code could bypass check_kcov_mode() check
+		 * and invoke __sanitizer_cov_trace_pc(). If such interrupt is
+		 * raised between writing pc and updating pos, the pc could be
+		 * overitten by the recursive __sanitizer_cov_trace_pc().
+		 * Update pos before writing pc to avoid such interleaving.
+		 */
 		WRITE_ONCE(area[0], pos);
+		barrier();
+		area[pos] = ip;
 	}
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
@@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
 	start_index = 1 + count * KCOV_WORDS_PER_CMP;
 	end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
 	if (likely(end_pos <= max_pos)) {
+		/* See comment in __sanitizer_cov_trace_pc(). */
+		WRITE_ONCE(area[0], count + 1);
+		barrier();
 		area[start_index] = type;
 		area[start_index + 1] = arg1;
 		area[start_index + 2] = arg2;
 		area[start_index + 3] = ip;
-		WRITE_ONCE(area[0], count + 1);
 	}
 }
 
-- 
cgit v1.2.3


From 499f12168aebd6da8fa32c9b7d6203ca9b5eb88d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 7 Apr 2022 14:56:32 -0400
Subject: tracing: Have event format check not flag %p* on
 __get_dynamic_array()

The print fmt check against trace events to make sure that the format does
not use pointers that may be freed from the time of the trace to the time
the event is read, gives a false positive on %pISpc when reading data that
was saved in __get_dynamic_array() when it is perfectly fine to do so, as
the data being read is on the ring buffer.

Link: https://lore.kernel.org/all/20220407144524.2a592ed6@canb.auug.org.au/

Cc: stable@vger.kernel.org
Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 78f313b7b315..d5913487821a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -392,12 +392,6 @@ static void test_event_printk(struct trace_event_call *call)
 			if (!(dereference_flags & (1ULL << arg)))
 				goto next_arg;
 
-			/* Check for __get_sockaddr */;
-			if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
-				dereference_flags &= ~(1ULL << arg);
-				goto next_arg;
-			}
-
 			/* Find the REC-> in the argument */
 			c = strchr(fmt + i, ',');
 			r = strstr(fmt + i, "REC->");
@@ -413,7 +407,14 @@ static void test_event_printk(struct trace_event_call *call)
 				a = strchr(fmt + i, '&');
 				if ((a && (a < r)) || test_field(r, call))
 					dereference_flags &= ~(1ULL << arg);
+			} else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
+				   (!c || r < c)) {
+				dereference_flags &= ~(1ULL << arg);
+			} else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
+				   (!c || r < c)) {
+				dereference_flags &= ~(1ULL << arg);
 			}
+
 		next_arg:
 			i--;
 			arg++;
-- 
cgit v1.2.3


From e35c2d8e22745751cf304ec3fe39616643db2e0a Mon Sep 17 00:00:00 2001
From: Li Huafei <lihuafei1@huawei.com>
Date: Wed, 27 Apr 2022 11:41:19 +0800
Subject: tracing: Reset the function filter after completing trampoline/graph
 selftest

The direct trampoline and graph coexistence test sets global_ops to
trace only 'trace_selftest_dynamic_test_func', but does not reset it
after the test is completed, resulting in the function filter being set
already after the system starts. Although it can be reset through the
tracefs interface, it is more or less confusing to the user, and we
should reset it to trace all functions after the trampoline/graph test
completes.

Link: https://lkml.kernel.org/r/20220427034119.24668-1-lihuafei1@huawei.com
Link: https://lore.kernel.org/all/20220418073958.104029-1-lihuafei1@huawei.com/

Fixes: 130c08065848 ("tracing: Add trampoline/graph selftest")
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_selftest.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index abcadbe933bb..a2d301f58ced 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -895,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 		ret = -1;
 		goto out;
 	}
+
+	/* Enable tracing on all functions again */
+	ftrace_set_global_filter(NULL, 0, 1);
 #endif
 
 	/* Don't test dynamic tracing, the function tracer already did */
-- 
cgit v1.2.3


From e4931b824a6f36cae9cc5632709f015cfa748a25 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Fri, 14 Jan 2022 21:10:52 +0800
Subject: tracing: Use trace_create_file() to simplify creation of tracefs
 entries

Creating tracefs entries with tracefs_create_file() followed by pr_warn()
is tedious and repetitive, we can use trace_create_file() to simplify
this process and make the code more readable.

Link: https://lkml.kernel.org/r/20220114131052.534382-1-ytcoode@gmail.com

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c                 |  9 +++------
 kernel/trace/trace_dynevent.c         |  9 ++-------
 kernel/trace/trace_events.c           | 29 ++++++++++-------------------
 kernel/trace/trace_kprobe.c           | 15 ++++-----------
 kernel/trace/trace_recursion_record.c |  7 ++-----
 5 files changed, 21 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5c465e70d146..6c5b12669e06 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -952,7 +952,6 @@ static struct tracer_stat function_stats __initdata = {
 static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
 {
 	struct ftrace_profile_stat *stat;
-	struct dentry *entry;
 	char *name;
 	int ret;
 	int cpu;
@@ -983,11 +982,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
 		}
 	}
 
-	entry = tracefs_create_file("function_profile_enabled",
-				    TRACE_MODE_WRITE, d_tracer, NULL,
-				    &ftrace_profile_fops);
-	if (!entry)
-		pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
+	trace_create_file("function_profile_enabled",
+			  TRACE_MODE_WRITE, d_tracer, NULL,
+			  &ftrace_profile_fops);
 }
 
 #else /* CONFIG_FUNCTION_PROFILER */
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index e34e8182ee4b..076b447a1b88 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -255,19 +255,14 @@ static const struct file_operations dynamic_events_ops = {
 /* Make a tracefs interface for controlling dynamic events */
 static __init int init_dynamic_event(void)
 {
-	struct dentry *entry;
 	int ret;
 
 	ret = tracing_init_dentry();
 	if (ret)
 		return 0;
 
-	entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
-				    NULL, &dynamic_events_ops);
-
-	/* Event list interface */
-	if (!entry)
-		pr_warn("Could not create tracefs 'dynamic_events' entry\n");
+	trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL,
+			  NULL, &dynamic_events_ops);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d5913487821a..9a88de9d7815 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3546,12 +3546,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 	struct dentry *d_events;
 	struct dentry *entry;
 
-	entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent,
-				    tr, &ftrace_set_event_fops);
-	if (!entry) {
-		pr_warn("Could not create tracefs 'set_event' entry\n");
+	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+				  tr, &ftrace_set_event_fops);
+	if (!entry)
 		return -ENOMEM;
-	}
 
 	d_events = tracefs_create_dir("events", parent);
 	if (!d_events) {
@@ -3566,16 +3564,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 
 	/* There are not as crucial, just warn if they are not created */
 
-	entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
-				    tr, &ftrace_set_event_pid_fops);
-	if (!entry)
-		pr_warn("Could not create tracefs 'set_event_pid' entry\n");
+	trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+			  tr, &ftrace_set_event_pid_fops);
 
-	entry = tracefs_create_file("set_event_notrace_pid",
-				    TRACE_MODE_WRITE, parent, tr,
-				    &ftrace_set_event_notrace_pid_fops);
-	if (!entry)
-		pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n");
+	trace_create_file("set_event_notrace_pid",
+			  TRACE_MODE_WRITE, parent, tr,
+			  &ftrace_set_event_notrace_pid_fops);
 
 	/* ring buffer internal formats */
 	trace_create_file("header_page", TRACE_MODE_READ, d_events,
@@ -3790,17 +3784,14 @@ static __init int event_trace_init_fields(void)
 __init int event_trace_init(void)
 {
 	struct trace_array *tr;
-	struct dentry *entry;
 	int ret;
 
 	tr = top_trace_array();
 	if (!tr)
 		return -ENODEV;
 
-	entry = tracefs_create_file("available_events", TRACE_MODE_READ,
-				    NULL, tr, &ftrace_avail_fops);
-	if (!entry)
-		pr_warn("Could not create tracefs 'available_events' entry\n");
+	trace_create_file("available_events", TRACE_MODE_READ,
+			  NULL, tr, &ftrace_avail_fops);
 
 	ret = early_event_add_tracer(NULL, tr);
 	if (ret)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 47cebef78532..93507330462c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1907,25 +1907,18 @@ core_initcall(init_kprobe_trace_early);
 static __init int init_kprobe_trace(void)
 {
 	int ret;
-	struct dentry *entry;
 
 	ret = tracing_init_dentry();
 	if (ret)
 		return 0;
 
-	entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE,
-				    NULL, NULL, &kprobe_events_ops);
-
 	/* Event list interface */
-	if (!entry)
-		pr_warn("Could not create tracefs 'kprobe_events' entry\n");
+	trace_create_file("kprobe_events", TRACE_MODE_WRITE,
+			  NULL, NULL, &kprobe_events_ops);
 
 	/* Profile interface */
-	entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ,
-				    NULL, NULL, &kprobe_profile_ops);
-
-	if (!entry)
-		pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
+	trace_create_file("kprobe_profile", TRACE_MODE_READ,
+			  NULL, NULL, &kprobe_profile_ops);
 
 	setup_boot_kprobe_events();
 
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index 4d4b78c8ca25..a520b11afb0d 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = {
 
 __init static int create_recursed_functions(void)
 {
-	struct dentry *dentry;
 
-	dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE,
-				   NULL, NULL, &recursed_functions_fops);
-	if (!dentry)
-		pr_warn("WARNING: Failed to create recursed_functions\n");
+	trace_create_file("recursed_functions", TRACE_MODE_WRITE,
+			  NULL, NULL, &recursed_functions_fops);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2889c658b2fbc7ad4c5d541734fdc1d97b130753 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Thu, 20 Jan 2022 06:59:49 +0000
Subject: ftrace: Deal with error return code of the ftrace_process_locs()
 function

The ftrace_process_locs() function may return -ENOMEM error code, which
should be handled by the callers.

Link: https://lkml.kernel.org/r/20220120065949.1813231-1-ytcoode@gmail.com

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6c5b12669e06..737e895e2aad 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6864,11 +6864,16 @@ void ftrace_module_enable(struct module *mod)
 
 void ftrace_module_init(struct module *mod)
 {
+	int ret;
+
 	if (ftrace_disabled || !mod->num_ftrace_callsites)
 		return;
 
-	ftrace_process_locs(mod, mod->ftrace_callsites,
-			    mod->ftrace_callsites + mod->num_ftrace_callsites);
+	ret = ftrace_process_locs(mod, mod->ftrace_callsites,
+				  mod->ftrace_callsites + mod->num_ftrace_callsites);
+	if (ret)
+		pr_warn("ftrace: failed to allocate entries for module '%s' functions\n",
+			mod->name);
 }
 
 static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
@@ -7201,15 +7206,19 @@ void __init ftrace_init(void)
 	pr_info("ftrace: allocating %ld entries in %ld pages\n",
 		count, count / ENTRIES_PER_PAGE + 1);
 
-	last_ftrace_enabled = ftrace_enabled = 1;
-
 	ret = ftrace_process_locs(NULL,
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
+	if (ret) {
+		pr_warn("ftrace: failed to allocate entries for functions\n");
+		goto failed;
+	}
 
 	pr_info("ftrace: allocated %ld pages with %ld groups\n",
 		ftrace_number_of_pages, ftrace_number_of_groups);
 
+	last_ftrace_enabled = ftrace_enabled = 1;
+
 	set_ftrace_early_filters();
 
 	return;
-- 
cgit v1.2.3


From cb24693d94ceaf658944ad2e922203c0503775d2 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Fri, 21 Jan 2022 09:56:23 +0000
Subject: tracing: Use strim() to remove whitespace instead of doing it
 manually

The tracing_set_trace_write() function just removes the trailing whitespace
from the user supplied tracer name, but the leading whitespace should also
be removed.

In addition, if the user supplied tracer name contains only a few
whitespace characters, the first one will not be removed using the current
method, which results it a single whitespace character left in the buf.

To fix all of these issues, we use strim() to correctly remove both the
leading and trailing whitespace.

Link: https://lkml.kernel.org/r/20220121095623.1826679-1-ytcoode@gmail.com

Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 087740f63d92..498ae22d4ffa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6461,7 +6461,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 {
 	struct trace_array *tr = filp->private_data;
 	char buf[MAX_TRACER_SIZE+1];
-	int i;
+	char *name;
 	size_t ret;
 	int err;
 
@@ -6475,11 +6475,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
-	/* strip ending whitespace. */
-	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-		buf[i] = 0;
+	name = strim(buf);
 
-	err = tracing_set_tracer(tr, buf);
+	err = tracing_set_tracer(tr, name);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 99696a2592bca641eb88cc9a80c90e591afebd0f Mon Sep 17 00:00:00 2001
From: Keita Suzuki <keitasuzuki.park@sslab.ics.keio.ac.jp>
Date: Mon, 25 Apr 2022 06:37:38 +0000
Subject: tracing: Fix potential double free in create_var_ref()

In create_var_ref(), init_var_ref() is called to initialize the fields
of variable ref_field, which is allocated in the previous function call
to create_hist_field(). Function init_var_ref() allocates the
corresponding fields such as ref_field->system, but frees these fields
when the function encounters an error. The caller later calls
destroy_hist_field() to conduct error handling, which frees the fields
and the variable itself. This results in double free of the fields which
are already freed in the previous function.

Fix this by storing NULL to the corresponding fields when they are freed
in init_var_ref().

Link: https://lkml.kernel.org/r/20220425063739.3859998-1-keitasuzuki.park@sslab.ics.keio.ac.jp

Fixes: 067fe038e70f ("tracing: Add variable reference handling to hist triggers")
CC: stable@vger.kernel.org
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Keita Suzuki <keitasuzuki.park@sslab.ics.keio.ac.jp>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 038dc591545d..c6a65738feb3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2093,8 +2093,11 @@ static int init_var_ref(struct hist_field *ref_field,
 	return err;
  free:
 	kfree(ref_field->system);
+	ref_field->system = NULL;
 	kfree(ref_field->event_name);
+	ref_field->event_name = NULL;
 	kfree(ref_field->name);
+	ref_field->name = NULL;
 
 	goto out;
 }
-- 
cgit v1.2.3


From b27f266f74fbda4ee36c2b2b04d15992860cf23b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <vvghjk1234@gmail.com>
Date: Tue, 3 May 2022 14:05:46 +0900
Subject: tracing: Fix return value of trace_pid_write()

Setting set_event_pid with trailing whitespace lead to endless write
system calls like below.

    $ strace echo "123 " > /sys/kernel/debug/tracing/set_event_pid
    execve("/usr/bin/echo", ["echo", "123 "], ...) = 0
    ...
    write(1, "123 \n", 5)                   = 4
    write(1, "\n", 1)                       = 0
    write(1, "\n", 1)                       = 0
    write(1, "\n", 1)                       = 0
    write(1, "\n", 1)                       = 0
    write(1, "\n", 1)                       = 0
    ....

This is because, the result of trace_get_user's are not returned when it
read at least one pid. To fix it, update read variable even if
parser->idx == 0.

The result of applied patch is below.

    $ strace echo "123 " > /sys/kernel/debug/tracing/set_event_pid
    execve("/usr/bin/echo", ["echo", "123 "], ...) = 0
    ...
    write(1, "123 \n", 5)                   = 5
    close(1)                                = 0

Link: https://lkml.kernel.org/r/20220503050546.288911-1-vvghjk1234@gmail.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Baik Song An <bsahn@etri.re.kr>
Cc: Hong Yeon Kim <kimhy@etri.re.kr>
Cc: Taeung Song <taeung@reallinux.co.kr>
Cc: linuxgeek@linuxgeek.io
Cc: stable@vger.kernel.org
Fixes: 4909010788640 ("tracing: Add set_event_pid directory for future use")
Signed-off-by: Wonhyuk Yang <vvghjk1234@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 498ae22d4ffa..4825883b2ffd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -721,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
 		pos = 0;
 
 		ret = trace_get_user(&parser, ubuf, cnt, &pos);
-		if (ret < 0 || !trace_parser_loaded(&parser))
+		if (ret < 0)
 			break;
 
 		read += ret;
 		ubuf += ret;
 		cnt -= ret;
 
+		if (!trace_parser_loaded(&parser))
+			break;
+
 		ret = -EINVAL;
 		if (kstrtoul(parser.buffer, 0, &val))
 			break;
@@ -753,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
 	if (!nr_pids) {
 		/* Cleared the list of pids */
 		trace_pid_list_free(pid_list);
-		read = ret;
 		pid_list = NULL;
 	}
 
-- 
cgit v1.2.3


From 43994049180704fd1faf78623fabd9a5cd443708 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 4 May 2022 12:36:31 +0900
Subject: kprobes: Fix build errors with CONFIG_KRETPROBES=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Max Filippov reported:

When building kernel with CONFIG_KRETPROBES=n kernel/kprobes.c
compilation fails with the following messages:

  kernel/kprobes.c: In function ‘recycle_rp_inst’:
  kernel/kprobes.c:1273:32: error: implicit declaration of function
                                   ‘get_kretprobe’

  kernel/kprobes.c: In function ‘kprobe_flush_task’:
  kernel/kprobes.c:1299:35: error: ‘struct task_struct’ has no member
                                   named ‘kretprobe_instances’

This came from the commit d741bf41d7c7 ("kprobes: Remove
kretprobe hash") which introduced get_kretprobe() and
kretprobe_instances member in task_struct when CONFIG_KRETPROBES=y,
but did not make recycle_rp_inst() and kprobe_flush_task()
depending on CONFIG_KRETPORBES.

Since those functions are only used for kretprobe, move those
functions into #ifdef CONFIG_KRETPROBE area.

Link: https://lkml.kernel.org/r/165163539094.74407.3838114721073251225.stgit@devnote2

Reported-by: Max Filippov <jcmvbkbc@gmail.com>
Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash")
Cc: "Naveen N . Rao" <naveen.n.rao@linux.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: "David S . Miller" <davem@davemloft.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Tested-by: Max Filippov <jcmvbkbc@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/kprobes.c | 144 +++++++++++++++++++++++++++----------------------------
 1 file changed, 71 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dbe57df2e199..172ba75a4a49 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1257,79 +1257,6 @@ void kprobe_busy_end(void)
 	preempt_enable();
 }
 
-#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
-static void free_rp_inst_rcu(struct rcu_head *head)
-{
-	struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
-
-	if (refcount_dec_and_test(&ri->rph->ref))
-		kfree(ri->rph);
-	kfree(ri);
-}
-NOKPROBE_SYMBOL(free_rp_inst_rcu);
-
-static void recycle_rp_inst(struct kretprobe_instance *ri)
-{
-	struct kretprobe *rp = get_kretprobe(ri);
-
-	if (likely(rp))
-		freelist_add(&ri->freelist, &rp->freelist);
-	else
-		call_rcu(&ri->rcu, free_rp_inst_rcu);
-}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
-/*
- * This function is called from delayed_put_task_struct() when a task is
- * dead and cleaned up to recycle any kretprobe instances associated with
- * this task. These left over instances represent probed functions that
- * have been called but will never return.
- */
-void kprobe_flush_task(struct task_struct *tk)
-{
-	struct kretprobe_instance *ri;
-	struct llist_node *node;
-
-	/* Early boot, not yet initialized. */
-	if (unlikely(!kprobes_initialized))
-		return;
-
-	kprobe_busy_begin();
-
-	node = __llist_del_all(&tk->kretprobe_instances);
-	while (node) {
-		ri = container_of(node, struct kretprobe_instance, llist);
-		node = node->next;
-
-		recycle_rp_inst(ri);
-	}
-
-	kprobe_busy_end();
-}
-NOKPROBE_SYMBOL(kprobe_flush_task);
-
-static inline void free_rp_inst(struct kretprobe *rp)
-{
-	struct kretprobe_instance *ri;
-	struct freelist_node *node;
-	int count = 0;
-
-	node = rp->freelist.head;
-	while (node) {
-		ri = container_of(node, struct kretprobe_instance, freelist);
-		node = node->next;
-
-		kfree(ri);
-		count++;
-	}
-
-	if (refcount_sub_and_test(count, &rp->rph->ref)) {
-		kfree(rp->rph);
-		rp->rph = NULL;
-	}
-}
-#endif	/* !CONFIG_KRETPROBE_ON_RETHOOK */
-
 /* Add the new probe to 'ap->list'. */
 static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
@@ -1928,6 +1855,77 @@ static struct notifier_block kprobe_exceptions_nb = {
 #ifdef CONFIG_KRETPROBES
 
 #if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+static void free_rp_inst_rcu(struct rcu_head *head)
+{
+	struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+
+	if (refcount_dec_and_test(&ri->rph->ref))
+		kfree(ri->rph);
+	kfree(ri);
+}
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
+
+static void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+	struct kretprobe *rp = get_kretprobe(ri);
+
+	if (likely(rp))
+		freelist_add(&ri->freelist, &rp->freelist);
+	else
+		call_rcu(&ri->rcu, free_rp_inst_rcu);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+	struct kretprobe_instance *ri;
+	struct llist_node *node;
+
+	/* Early boot, not yet initialized. */
+	if (unlikely(!kprobes_initialized))
+		return;
+
+	kprobe_busy_begin();
+
+	node = __llist_del_all(&tk->kretprobe_instances);
+	while (node) {
+		ri = container_of(node, struct kretprobe_instance, llist);
+		node = node->next;
+
+		recycle_rp_inst(ri);
+	}
+
+	kprobe_busy_end();
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+	struct kretprobe_instance *ri;
+	struct freelist_node *node;
+	int count = 0;
+
+	node = rp->freelist.head;
+	while (node) {
+		ri = container_of(node, struct kretprobe_instance, freelist);
+		node = node->next;
+
+		kfree(ri);
+		count++;
+	}
+
+	if (refcount_sub_and_test(count, &rp->rph->ref)) {
+		kfree(rp->rph);
+		rp->rph = NULL;
+	}
+}
+
 /* This assumes the 'tsk' is the current task or the is not running. */
 static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
 						  struct llist_node **cur)
-- 
cgit v1.2.3


From aa748949b4e665f473bc5abdc5f66029cb5f5522 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 10 May 2022 11:45:23 +0200
Subject: tracing/timerlat: Notify IRQ new max latency only if stop tracing is
 set

Currently, the notification of a new max latency is sent from
timerlat's IRQ handler anytime a new max latency is found.

While this behavior is not wrong, the send IPI overhead itself
will increase the thread latency and that is not the desired
effect (tracing overhead).

Moreover, the thread will notify a new max latency again because
the thread latency as it is always higher than the IRQ latency
that woke it up.

The only case in which it is helpful to notify a new max latency
from IRQ is when stop tracing (for the IRQ) is set, as in this
case, the thread will not be dispatched.

Notify a new max latency from the IRQ handler only if stop tracing is
set for the IRQ handler.

Link: https://lkml.kernel.org/r/2c2d9a56c0886c8402ba320de32856cbbb10c2bb.1652175637.git.bristot@kernel.org

Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Reported-by: Clark Williams <williams@redhat.com>
Fixes: a955d7eac177 ("trace: Add timerlat tracer")
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e9ae1f33a7f0..6494ca27ea6f 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1578,11 +1578,12 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
 
 	trace_timerlat_sample(&s);
 
-	notify_new_max_latency(diff);
-
-	if (osnoise_data.stop_tracing)
-		if (time_to_us(diff) >= osnoise_data.stop_tracing)
+	if (osnoise_data.stop_tracing) {
+		if (time_to_us(diff) >= osnoise_data.stop_tracing) {
 			osnoise_stop_tracing();
+			notify_new_max_latency(diff);
+		}
+	}
 
 	wake_up_process(tlat->kthread);
 
-- 
cgit v1.2.3


From 4dd2aea24ed7613735664feadc9879d37f718c23 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 10 May 2022 11:45:24 +0200
Subject: tracing/timerlat: Print stacktrace in the IRQ handler if needed

If print_stack and stop_tracing_us are set, and stop_tracing_us is hit
with latency higher than or equal to print_stack, print the
stack at the IRQ handler as it is useful to define the root cause for
the IRQ latency.

Link: https://lkml.kernel.org/r/fd04530ce98ae9270e41bb124ee5bf67b05ecfed.1652175637.git.bristot@kernel.org

Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 6494ca27ea6f..9b204ee3c6f5 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1580,6 +1580,19 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
 
 	if (osnoise_data.stop_tracing) {
 		if (time_to_us(diff) >= osnoise_data.stop_tracing) {
+
+			/*
+			 * At this point, if stop_tracing is set and <= print_stack,
+			 * print_stack is set and would be printed in the thread handler.
+			 *
+			 * Thus, print the stack trace as it is helpful to define the
+			 * root cause of an IRQ latency.
+			 */
+			if (osnoise_data.stop_tracing <= osnoise_data.print_stack) {
+				timerlat_save_stack(0);
+				timerlat_dump_stack(time_to_us(diff));
+			}
+
 			osnoise_stop_tracing();
 			notify_new_max_latency(diff);
 		}
-- 
cgit v1.2.3


From 9c556e5a4dd5cfc0939a0575577d0517118f98af Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Tue, 10 May 2022 11:45:25 +0200
Subject: tracing/timerlat: Do not wakeup the thread if the trace stops at the
 IRQ

There is no need to wakeup the timerlat/ thread if stop tracing is hit
at the timerlat's IRQ handler.

Return before waking up timerlat's thread.

Link: https://lkml.kernel.org/r/b392356c91b56aedd2b289513cc56a84cf87e60d.1652175637.git.bristot@kernel.org

Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 9b204ee3c6f5..035ec8b84e12 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1595,6 +1595,8 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
 
 			osnoise_stop_tracing();
 			notify_new_max_latency(diff);
+
+			return HRTIMER_NORESTART;
 		}
 	}
 
-- 
cgit v1.2.3


From 2d601b98643dd2846e2958d931826e7b7af44969 Mon Sep 17 00:00:00 2001
From: liqiong <liqiong@nfschina.com>
Date: Thu, 12 May 2022 22:32:30 +0800
Subject: tracing: Change "char *" string form to "char []"

The "char []" string form declares a single variable. It is better
than "char *" which creates two variables in the final assembly.

Link: https://lkml.kernel.org/r/20220512143230.28796-1-liqiong@nfschina.com

Signed-off-by: liqiong <liqiong@nfschina.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c             | 2 +-
 kernel/trace/trace_events_hist.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4825883b2ffd..e38a7ca4cdd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4252,7 +4252,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file
 				       unsigned int flags)
 {
 	bool tgid = flags & TRACE_ITER_RECORD_TGID;
-	const char *space = "            ";
+	static const char space[] = "            ";
 	int prec = tgid ? 12 : 2;
 
 	print_event_info(buf, m);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index c6a65738feb3..48e82e141d54 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4165,7 +4165,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 	return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
 }
 
-static const char *no_comm = "(no comm)";
+static const char no_comm[] = "(no comm)";
 
 static u64 hist_field_execname(struct hist_field *hist_field,
 			       struct tracing_map_elt *elt,
-- 
cgit v1.2.3


From 2decd16f47e3df3234b5486fe89a9aa5a1102af1 Mon Sep 17 00:00:00 2001
From: liqiong <liqiong@nfschina.com>
Date: Fri, 13 May 2022 15:52:21 +0800
Subject: tracing: Cleanup code by removing init "char *name"

The pointer is assigned to "type->name" anyway. no need to
initialize with "preemption".

Link: https://lkml.kernel.org/r/20220513075221.26275-1-liqiong@nfschina.com

Signed-off-by: liqiong <liqiong@nfschina.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e38a7ca4cdd0..f400800bc910 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4276,9 +4276,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	struct tracer *type = iter->trace;
 	unsigned long entries;
 	unsigned long total;
-	const char *name = "preemption";
-
-	name = type->name;
+	const char *name = type->name;
 
 	get_total_entries(buf, &total, &entries);
 
-- 
cgit v1.2.3


From 3a2bfec0b02f2226ff3376a5d2ff604d799bd7ea Mon Sep 17 00:00:00 2001
From: Li kunyu <kunyu@nfschina.com>
Date: Wed, 18 May 2022 10:36:40 +0800
Subject: ftrace: Remove return value of ftrace_arch_modify_*()

All instances of the function ftrace_arch_modify_prepare() and
  ftrace_arch_modify_post_process() return zero. There's no point in
  checking their return value. Just have them be void functions.

Link: https://lkml.kernel.org/r/20220518023639.4065-1-kunyu@nfschina.com

Signed-off-by: Li kunyu <kunyu@nfschina.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 737e895e2aad..852c731cbd9a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2704,18 +2704,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec)
  * archs can override this function if they must do something
  * before the modifying code is performed.
  */
-int __weak ftrace_arch_code_modify_prepare(void)
+void __weak ftrace_arch_code_modify_prepare(void)
 {
-	return 0;
 }
 
 /*
  * archs can override this function if they must do something
  * after the modifying code is performed.
  */
-int __weak ftrace_arch_code_modify_post_process(void)
+void __weak ftrace_arch_code_modify_post_process(void)
 {
-	return 0;
 }
 
 void ftrace_modify_all_code(int command)
@@ -2801,12 +2799,7 @@ void __weak arch_ftrace_update_code(int command)
 
 static void ftrace_run_update_code(int command)
 {
-	int ret;
-
-	ret = ftrace_arch_code_modify_prepare();
-	FTRACE_WARN_ON(ret);
-	if (ret)
-		return;
+	ftrace_arch_code_modify_prepare();
 
 	/*
 	 * By default we use stop_machine() to modify the code.
@@ -2816,8 +2809,7 @@ static void ftrace_run_update_code(int command)
 	 */
 	arch_ftrace_update_code(command);
 
-	ret = ftrace_arch_code_modify_post_process();
-	FTRACE_WARN_ON(ret);
+	ftrace_arch_code_modify_post_process();
 }
 
 static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
-- 
cgit v1.2.3


From 50c697819d59c5013a66728940015348919a0c0c Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Sat, 21 May 2022 13:11:31 +0200
Subject: ftrace: Fix typo in comment

Spelling mistake (triple letters) in comment.
Detected with the help of Coccinelle.

Link: https://lkml.kernel.org/r/20220521111145.81697-81-Julia.Lawall@inria.fr

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 852c731cbd9a..fb8f08b4bd41 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -119,7 +119,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 struct ftrace_ops global_ops;
 
-/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */
+/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */
 void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *op, struct ftrace_regs *fregs);
 
-- 
cgit v1.2.3


From 154827f8e53d8c492b3fb0cb757fbcadb5d516b5 Mon Sep 17 00:00:00 2001
From: Gautam Menghani <gautammenghani201@gmail.com>
Date: Sat, 21 May 2022 23:18:26 -0700
Subject: tracing: Initialize integer variable to prevent garbage return value

Initialize the integer variable to 0 to fix the clang scan warning:
Undefined or garbage value returned to caller
[core.uninitialized.UndefReturn]
        return ret;

Link: https://lkml.kernel.org/r/20220522061826.1751-1-gautammenghani201@gmail.com

Cc: stable@vger.kernel.org
Fixes: 8993665abcce ("tracing/boot: Support multiple handlers for per-event histogram")
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Gautam Menghani <gautammenghani201@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_boot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 0580287d7a0d..778200dd8ede 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
 {
 	struct xbc_node *node;
 	const char *p, *handler;
-	int ret;
+	int ret = 0;
 
 	handler = xbc_node_get_data(hnode);
 
-- 
cgit v1.2.3


From bb5eb8f3b329789fbf22c85328dcf696a3e97ffb Mon Sep 17 00:00:00 2001
From: Congyu Liu <liu3101@purdue.edu>
Date: Mon, 23 May 2022 06:30:33 +0000
Subject: tracing: Disable kcov on trace_preemptirq.c

Functions in trace_preemptirq.c could be invoked from early interrupt
code that bypasses kcov trace function's in_task() check. Disable kcov
on this file to reduce random code coverage.

Link: https://lkml.kernel.org/r/20220523063033.1778974-1-liu3101@purdue.edu

Acked-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Congyu Liu <liu3101@purdue.edu>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d77cd8032213..0d261774d6f3 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE
 GCOV_PROFILE := y
 endif
 
+# Functions in this file could be invoked from early interrupt
+# code and produce random code coverage.
+KCOV_INSTRUMENT_trace_preemptirq.o := n
+
 CFLAGS_bpf_trace.o := -I$(src)
 
 CFLAGS_trace_benchmark.o := -I$(src)
-- 
cgit v1.2.3


From 0a54f556b035e5217e21724d82dd98ce695bd6d6 Mon Sep 17 00:00:00 2001
From: sunliming <sunliming@kylinos.cn>
Date: Tue, 24 May 2022 14:39:37 +0800
Subject: tracing: Fix comments of create_filter()

The name in comments of parameter "filter_string" in function
create_filter is annotated as "filter_str", just fix it.

Link: https://lkml.kernel.org/r/20220524063937.52873-1-sunliming@kylinos.cn

Signed-off-by: sunliming <sunliming@kylinos.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b458a9afa2c0..4b1057ab9d96 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1816,7 +1816,7 @@ static void create_filter_finish(struct filter_parse_error *pe)
  * create_filter - create a filter for a trace_event_call
  * @tr: the trace array associated with these events
  * @call: trace_event_call to create a filter for
- * @filter_str: filter string
+ * @filter_string: filter string
  * @set_str: remember @filter_str and enable detailed error in filter
  * @filterp: out param for created filter (always updated on return)
  *           Must be a pointer that references a NULL pointer.
-- 
cgit v1.2.3


From 7d54c15cb89a29a5f59e5ffc9ee62e6591769ef1 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Tue, 24 May 2022 10:08:39 -0700
Subject: ftrace: Clean up hash direct_functions on register failures

We see the following GPF when register_ftrace_direct fails:

[ ] general protection fault, probably for non-canonical address \
  0x200000000000010: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
[...]
[ ] RIP: 0010:ftrace_find_rec_direct+0x53/0x70
[ ] Code: 48 c1 e0 03 48 03 42 08 48 8b 10 31 c0 48 85 d2 74 [...]
[ ] RSP: 0018:ffffc9000138bc10 EFLAGS: 00010206
[ ] RAX: 0000000000000000 RBX: ffffffff813e0df0 RCX: 000000000000003b
[ ] RDX: 0200000000000000 RSI: 000000000000000c RDI: ffffffff813e0df0
[ ] RBP: ffffffffa00a3000 R08: ffffffff81180ce0 R09: 0000000000000001
[ ] R10: ffffc9000138bc18 R11: 0000000000000001 R12: ffffffff813e0df0
[ ] R13: ffffffff813e0df0 R14: ffff888171b56400 R15: 0000000000000000
[ ] FS:  00007fa9420c7780(0000) GS:ffff888ff6a00000(0000) knlGS:000000000
[ ] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ ] CR2: 000000000770d000 CR3: 0000000107d50003 CR4: 0000000000370ee0
[ ] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ ] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ ] Call Trace:
[ ]  <TASK>
[ ]  register_ftrace_direct+0x54/0x290
[ ]  ? render_sigset_t+0xa0/0xa0
[ ]  bpf_trampoline_update+0x3f5/0x4a0
[ ]  ? 0xffffffffa00a3000
[ ]  bpf_trampoline_link_prog+0xa9/0x140
[ ]  bpf_tracing_prog_attach+0x1dc/0x450
[ ]  bpf_raw_tracepoint_open+0x9a/0x1e0
[ ]  ? find_held_lock+0x2d/0x90
[ ]  ? lock_release+0x150/0x430
[ ]  __sys_bpf+0xbd6/0x2700
[ ]  ? lock_is_held_type+0xd8/0x130
[ ]  __x64_sys_bpf+0x1c/0x20
[ ]  do_syscall_64+0x3a/0x80
[ ]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[ ] RIP: 0033:0x7fa9421defa9
[ ] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 9 f8 [...]
[ ] RSP: 002b:00007ffed743bd78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
[ ] RAX: ffffffffffffffda RBX: 00000000069d2480 RCX: 00007fa9421defa9
[ ] RDX: 0000000000000078 RSI: 00007ffed743bd80 RDI: 0000000000000011
[ ] RBP: 00007ffed743be00 R08: 0000000000bb7270 R09: 0000000000000000
[ ] R10: 00000000069da210 R11: 0000000000000246 R12: 0000000000000001
[ ] R13: 00007ffed743c4b0 R14: 00000000069d2480 R15: 0000000000000001
[ ]  </TASK>
[ ] Modules linked in: klp_vm(OK)
[ ] ---[ end trace 0000000000000000 ]---

One way to trigger this is:
  1. load a livepatch that patches kernel function xxx;
  2. run bpftrace -e 'kfunc:xxx {}', this will fail (expected for now);
  3. repeat #2 => gpf.

This is because the entry is added to direct_functions, but not removed.
Fix this by remove the entry from direct_functions when
register_ftrace_direct fails.

Also remove the last trailing space from ftrace.c, so we don't have to
worry about it anymore.

Link: https://lkml.kernel.org/r/20220524170839.900849-1-song@kernel.org

Cc: stable@vger.kernel.org
Fixes: 763e34e74bb7 ("ftrace: Add register_ftrace_direct()")
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fb8f08b4bd41..d653ef4febc5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4454,7 +4454,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
  * @ip: The instruction pointer address to remove the data from
  *
  * Returns the data if it is found, otherwise NULL.
- * Note, if the data pointer is used as the data itself, (see 
+ * Note, if the data pointer is used as the data itself, (see
  * ftrace_func_mapper_find_ip(), then the return value may be meaningless,
  * if the data pointer was set to zero.
  */
@@ -5188,8 +5188,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 		goto out_unlock;
 
 	ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
-	if (ret)
-		remove_hash_entry(direct_functions, entry);
 
 	if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
 		ret = register_ftrace_function(&direct_ops);
@@ -5198,6 +5196,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 	}
 
 	if (ret) {
+		remove_hash_entry(direct_functions, entry);
 		kfree(entry);
 		if (!direct->count) {
 			list_del_rcu(&direct->next);
-- 
cgit v1.2.3


From 8d4a21b5ac9dad5d5221c60060b3ac28d22f57ca Mon Sep 17 00:00:00 2001
From: sunliming <sunliming@kylinos.cn>
Date: Thu, 26 May 2022 15:29:57 +0800
Subject: tracing: Fix comments for event_trigger_separate_filter()

The parameter name in comments of event_trigger_separate_filter() is
inconsistent with actual parameter name, fix it.

Link: https://lkml.kernel.org/r/20220526072957.165655-1-sunliming@kylinos.cn

Signed-off-by: sunliming <sunliming@kylinos.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_trigger.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 21592bed2e89..cb866c3141af 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -738,15 +738,15 @@ bool event_trigger_empty_param(const char *param)
 
 /**
  * event_trigger_separate_filter - separate an event trigger from a filter
- * @param: The param string containing trigger and possibly filter
- * @trigger: outparam, will be filled with a pointer to the trigger
+ * @param_and_filter: String containing trigger and possibly filter
+ * @param: outparam, will be filled with a pointer to the trigger
  * @filter: outparam, will be filled with a pointer to the filter
  * @param_required: Specifies whether or not the param string is required
  *
  * Given a param string of the form '[trigger] [if filter]', this
  * function separates the filter from the trigger and returns the
- * trigger in *trigger and the filter in *filter.  Either the *trigger
- * or the *filter may be set to NULL by this function - if not set to
+ * trigger in @param and the filter in @filter.  Either the @param
+ * or the @filter may be set to NULL by this function - if not set to
  * NULL, they will contain strings corresponding to the trigger and
  * filter.
  *
-- 
cgit v1.2.3


From 8b4dd2d8627e88dc3bd71bf29c48aaae2b69572b Mon Sep 17 00:00:00 2001
From: Haowen Bai <baihaowen@meizu.com>
Date: Fri, 27 May 2022 18:03:54 +0800
Subject: perf/core: Remove unused local variable

Drop LIST_HEAD() where the variable it declares is never used.

Compiler probably never warned us, because the LIST_HEAD()
initializer is technically 'usage'.

[ mingo: Tweak changelog. ]

Signed-off-by: Haowen Bai <baihaowen@meizu.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/1653645835-29206-1-git-send-email-baihaowen@meizu.com
---
 kernel/events/core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 950b25c3f210..80782cddb1da 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4257,7 +4257,6 @@ static void perf_event_remove_on_exec(int ctxn)
 {
 	struct perf_event_context *ctx, *clone_ctx = NULL;
 	struct perf_event *event, *next;
-	LIST_HEAD(free_list);
 	unsigned long flags;
 	bool modified = false;
 
-- 
cgit v1.2.3


From 809631e2bff5aba056df75cc4e43127dbd0278c9 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 26 May 2022 22:36:56 +0206
Subject: Revert "printk: wake up all waiters"

This reverts commit 938ba4084abcf6fdd21d9078513c52f8fb9b00d0.

The wait queue @log_wait never has exclusive waiters, so there
is no need to use wake_up_interruptible_all(). Using
wake_up_interruptible() was the correct function to wake all
waiters.

Since there are no exclusive waiters, erroneously changing
wake_up_interruptible() to wake_up_interruptible_all() did not
result in any behavior change. However, using
wake_up_interruptible_all() on a wait queue without exclusive
waiters is fundamentally wrong.

Go back to using wake_up_interruptible() to wake all waiters.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220526203056.81123-1-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a3e1035929b0..ea3dd55709e7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3904,7 +3904,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
-		wake_up_interruptible_all(&log_wait);
+		wake_up_interruptible(&log_wait);
 }
 
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
-- 
cgit v1.2.3


From 3e35142ef99fe6b4fe5d834ad43ee13cca10a2dc Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 19 May 2022 14:42:37 +0530
Subject: kexec_file: drop weak attribute from
 arch_kexec_apply_relocations[_add]

Since commit d1bcae833b32f1 ("ELF: Don't generate unused section
symbols") [1], binutils (v2.36+) started dropping section symbols that
it thought were unused.  This isn't an issue in general, but with
kexec_file.c, gcc is placing kexec_arch_apply_relocations[_add] into a
separate .text.unlikely section and the section symbol ".text.unlikely"
is being dropped. Due to this, recordmcount is unable to find a non-weak
symbol in .text.unlikely to generate a relocation record against.

Address this by dropping the weak attribute from these functions.
Instead, follow the existing pattern of having architectures #define the
name of the function they want to override in their headers.

[1] https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d1bcae833b32f1

[akpm@linux-foundation.org: arch/s390/include/asm/kexec.h needs linux/module.h]
Link: https://lkml.kernel.org/r/20220519091237.676736-1-naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_file.c | 34 ----------------------------------
 1 file changed, 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8347fc158d2b..c108a2a88754 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -108,40 +108,6 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
 }
 #endif
 
-/*
- * arch_kexec_apply_relocations_add - apply relocations of type RELA
- * @pi:		Purgatory to be relocated.
- * @section:	Section relocations applying to.
- * @relsec:	Section containing RELAs.
- * @symtab:	Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
-				 const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
-	pr_err("RELA relocation unsupported.\n");
-	return -ENOEXEC;
-}
-
-/*
- * arch_kexec_apply_relocations - apply relocations of type REL
- * @pi:		Purgatory to be relocated.
- * @section:	Section relocations applying to.
- * @relsec:	Section containing RELs.
- * @symtab:	Corresponding symtab.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __weak
-arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
-			     const Elf_Shdr *relsec, const Elf_Shdr *symtab)
-{
-	pr_err("REL relocation unsupported.\n");
-	return -ENOEXEC;
-}
-
 /*
  * Free up memory used by kernel, initrd, and command line. This is temporary
  * memory allocation which is not needed any more after these buffers have
-- 
cgit v1.2.3


From caff1fa4118cec4dfd4336521ebd22a6408a1e3e Mon Sep 17 00:00:00 2001
From: Menglong Dong <imagedong@tencent.com>
Date: Tue, 24 May 2022 10:12:27 +0800
Subject: bpf: Fix probe read error in ___bpf_prog_run()

I think there is something wrong with BPF_PROBE_MEM in ___bpf_prog_run()
in big-endian machine. Let's make a test and see what will happen if we
want to load a 'u16' with BPF_PROBE_MEM.

Let's make the src value '0x0001', the value of dest register will become
0x0001000000000000, as the value will be loaded to the first 2 byte of
DST with following code:

  bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off));

Obviously, the value in DST is not correct. In fact, we can compare
BPF_PROBE_MEM with LDX_MEM_H:

  DST = *(SIZE *)(unsigned long) (SRC + insn->off);

If the memory load is done by LDX_MEM_H, the value in DST will be 0x1 now.

And I think this error results in the test case 'test_bpf_sk_storage_map'
failing:

  test_bpf_sk_storage_map:PASS:bpf_iter_bpf_sk_storage_map__open_and_load 0 nsec
  test_bpf_sk_storage_map:PASS:socket 0 nsec
  test_bpf_sk_storage_map:PASS:map_update 0 nsec
  test_bpf_sk_storage_map:PASS:socket 0 nsec
  test_bpf_sk_storage_map:PASS:map_update 0 nsec
  test_bpf_sk_storage_map:PASS:socket 0 nsec
  test_bpf_sk_storage_map:PASS:map_update 0 nsec
  test_bpf_sk_storage_map:PASS:attach_iter 0 nsec
  test_bpf_sk_storage_map:PASS:create_iter 0 nsec
  test_bpf_sk_storage_map:PASS:read 0 nsec
  test_bpf_sk_storage_map:FAIL:ipv6_sk_count got 0 expected 3
  $10/26 bpf_iter/bpf_sk_storage_map:FAIL

The code of the test case is simply, it will load sk->sk_family to the
register with BPF_PROBE_MEM and check if it is AF_INET6. With this patch,
now the test case 'bpf_iter' can pass:

  $10  bpf_iter:OK

Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter")
Signed-off-by: Menglong Dong <imagedong@tencent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Reviewed-by: Hao Peng <flyingpeng@tencent.com>
Cc: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/bpf/20220524021228.533216-1-imagedong@tencent.com
---
 kernel/bpf/core.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cacd8684c3c4..5f6f3f829b36 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1953,6 +1953,11 @@ out:
 		CONT;							\
 	LDX_MEM_##SIZEOP:						\
 		DST = *(SIZE *)(unsigned long) (SRC + insn->off);	\
+		CONT;							\
+	LDX_PROBE_MEM_##SIZEOP:						\
+		bpf_probe_read_kernel(&DST, sizeof(SIZE),		\
+				      (const void *)(long) (SRC + insn->off));	\
+		DST = *((SIZE *)&DST);					\
 		CONT;
 
 	LDST(B,   u8)
@@ -1960,15 +1965,6 @@ out:
 	LDST(W,  u32)
 	LDST(DW, u64)
 #undef LDST
-#define LDX_PROBE(SIZEOP, SIZE)							\
-	LDX_PROBE_MEM_##SIZEOP:							\
-		bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off));	\
-		CONT;
-	LDX_PROBE(B,  1)
-	LDX_PROBE(H,  2)
-	LDX_PROBE(W,  4)
-	LDX_PROBE(DW, 8)
-#undef LDX_PROBE
 
 #define ATOMIC_ALU_OP(BOP, KOP)						\
 		case BOP:						\
-- 
cgit v1.2.3


From b39181f7c6907dc66ff937b74758671fa6ba430c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 26 May 2022 14:19:12 -0400
Subject: ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function

If an unused weak function was traced, it's call to fentry will still
exist, which gets added into the __mcount_loc table. Ftrace will use
kallsyms to retrieve the name for each location in __mcount_loc to display
it in the available_filter_functions and used to enable functions via the
name matching in set_ftrace_filter/notrace. Enabling these functions do
nothing but enable an unused call to ftrace_caller. If a traced weak
function is overridden, the symbol of the function would be used for it,
which will either created duplicate names, or if the previous function was
not traced, it would be incorrectly be listed in available_filter_functions
as a function that can be traced.

This became an issue with BPF[1] as there are tooling that enables the
direct callers via ftrace but then checks to see if the functions were
actually enabled. The case of one function that was marked notrace, but
was followed by an unused weak function that was traced. The unused
function's call to fentry was added to the __mcount_loc section, and
kallsyms retrieved the untraced function's symbol as the weak function was
overridden. Since the untraced function would not get traced, the BPF
check would detect this and fail.

The real fix would be to fix kallsyms to not show addresses of weak
functions as the function before it. But that would require adding code in
the build to add function size to kallsyms so that it can know when the
function ends instead of just using the start of the next known symbol.

In the mean time, this is a work around. Add a FTRACE_MCOUNT_MAX_OFFSET
macro that if defined, ftrace will ignore any function that has its call
to fentry/mcount that has an offset from the symbol that is greater than
FTRACE_MCOUNT_MAX_OFFSET.

If CONFIG_HAVE_FENTRY is defined for x86, define FTRACE_MCOUNT_MAX_OFFSET
to zero (unless IBT is enabled), which will have ftrace ignore all locations
that are not at the start of the function (or one after the ENDBR
instruction).

A worker thread is added at boot up to scan all the ftrace record entries,
and will mark any that fail the FTRACE_MCOUNT_MAX_OFFSET test as disabled.
They will still appear in the available_filter_functions file as:

  __ftrace_invalid_address___<invalid-offset>

(showing the offset that caused it to be invalid).

This is required for tools that use libtracefs (like trace-cmd does) that
scan the available_filter_functions and enable set_ftrace_filter and
set_ftrace_notrace using indexes of the function listed in the file (this
is a speedup, as enabling thousands of files via names is an O(n^2)
operation and can take minutes to complete, where the indexing takes less
than a second).

The invalid functions cannot be removed from available_filter_functions as
the names there correspond to the ftrace records in the array that manages
them (and the indexing depends on this).

[1] https://lore.kernel.org/all/20220412094923.0abe90955e5db486b7bca279@kernel.org/

Link: https://lkml.kernel.org/r/20220526141912.794c2786@gandalf.local.home

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 139 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d653ef4febc5..c5088c76a108 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,8 @@
 #include "trace_output.h"
 #include "trace_stat.h"
 
+#define FTRACE_INVALID_FUNCTION		"__ftrace_invalid_address__"
+
 #define FTRACE_WARN_ON(cond)			\
 	({					\
 		int ___r = cond;		\
@@ -3654,6 +3656,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
 		seq_printf(m, " ->%pS", ptr);
 }
 
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+/*
+ * Weak functions can still have an mcount/fentry that is saved in
+ * the __mcount_loc section. These can be detected by having a
+ * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the
+ * symbol found by kallsyms is not the function that the mcount/fentry
+ * is part of. The offset is much greater in these cases.
+ *
+ * Test the record to make sure that the ip points to a valid kallsyms
+ * and if not, mark it disabled.
+ */
+static int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+	char str[KSYM_SYMBOL_LEN];
+	unsigned long offset;
+	const char *ret;
+
+	ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str);
+
+	/* Weak functions can cause invalid addresses */
+	if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+		rec->flags |= FTRACE_FL_DISABLED;
+		return 0;
+	}
+	return 1;
+}
+
+static struct workqueue_struct *ftrace_check_wq __initdata;
+static struct work_struct ftrace_check_work __initdata;
+
+/*
+ * Scan all the mcount/fentry entries to make sure they are valid.
+ */
+static __init void ftrace_check_work_func(struct work_struct *work)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+		test_for_valid_rec(rec);
+	} while_for_each_ftrace_rec();
+	mutex_unlock(&ftrace_lock);
+}
+
+static int __init ftrace_check_for_weak_functions(void)
+{
+	INIT_WORK(&ftrace_check_work, ftrace_check_work_func);
+
+	ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0);
+
+	queue_work(ftrace_check_wq, &ftrace_check_work);
+	return 0;
+}
+
+static int __init ftrace_check_sync(void)
+{
+	/* Make sure the ftrace_check updates are finished */
+	if (ftrace_check_wq)
+		destroy_workqueue(ftrace_check_wq);
+	return 0;
+}
+
+late_initcall_sync(ftrace_check_sync);
+subsys_initcall(ftrace_check_for_weak_functions);
+
+static int print_rec(struct seq_file *m, unsigned long ip)
+{
+	unsigned long offset;
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+	const char *ret;
+
+	ret = kallsyms_lookup(ip, NULL, &offset, &modname, str);
+	/* Weak functions can cause invalid addresses */
+	if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) {
+		snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld",
+			 FTRACE_INVALID_FUNCTION, offset);
+		ret = NULL;
+	}
+
+	seq_puts(m, str);
+	if (modname)
+		seq_printf(m, " [%s]", modname);
+	return ret == NULL ? -1 : 0;
+}
+#else
+static inline int test_for_valid_rec(struct dyn_ftrace *rec)
+{
+	return 1;
+}
+
+static inline int print_rec(struct seq_file *m, unsigned long ip)
+{
+	seq_printf(m, "%ps", (void *)ip);
+	return 0;
+}
+#endif
+
 static int t_show(struct seq_file *m, void *v)
 {
 	struct ftrace_iterator *iter = m->private;
@@ -3678,7 +3779,13 @@ static int t_show(struct seq_file *m, void *v)
 	if (!rec)
 		return 0;
 
-	seq_printf(m, "%ps", (void *)rec->ip);
+	if (print_rec(m, rec->ip)) {
+		/* This should only happen when a rec is disabled */
+		WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED));
+		seq_putc(m, '\n');
+		return 0;
+	}
+
 	if (iter->flags & FTRACE_ITER_ENABLED) {
 		struct ftrace_ops *ops;
 
@@ -3996,6 +4103,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
 	return 0;
 }
 
+#ifdef FTRACE_MCOUNT_MAX_OFFSET
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+	unsigned long offset;
+
+	kallsyms_lookup(ip, NULL, &offset, modname, str);
+	if (offset > FTRACE_MCOUNT_MAX_OFFSET)
+		return -1;
+	return 0;
+}
+#else
+static int lookup_ip(unsigned long ip, char **modname, char *str)
+{
+	kallsyms_lookup(ip, NULL, NULL, modname, str);
+	return 0;
+}
+#endif
+
 static int
 ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
 		struct ftrace_glob *mod_g, int exclude_mod)
@@ -4003,7 +4128,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
 	char str[KSYM_SYMBOL_LEN];
 	char *modname;
 
-	kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+	if (lookup_ip(rec->ip, &modname, str)) {
+		/* This should only happen when a rec is disabled */
+		WARN_ON_ONCE(system_state == SYSTEM_RUNNING &&
+			     !(rec->flags & FTRACE_FL_DISABLED));
+		return 0;
+	}
 
 	if (mod_g) {
 		int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0;
@@ -6819,6 +6949,13 @@ void ftrace_module_enable(struct module *mod)
 		    !within_module_init(rec->ip, mod))
 			break;
 
+		/* Weak functions should still be ignored */
+		if (!test_for_valid_rec(rec)) {
+			/* Clear all other flags. Should not be enabled anyway */
+			rec->flags = FTRACE_FL_DISABLED;
+			continue;
+		}
+
 		cnt = 0;
 
 		/*
-- 
cgit v1.2.3


From 82f586f923e3ac6062bc7867717a7f8afc09e0ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 30 May 2022 09:45:38 +0200
Subject: sched/autogroup: Fix sysctl move

Ivan reported /proc/sys/kernel/sched_autogroup_enabled went walk-about
and using the noautogroup command line parameter would result in a
boot error message.

Turns out the sysctl move placed the init function wrong.

Fixes: c8eaf6ac76f4 ("sched: move autogroup sysctls into its own file")
Reported-by: Ivan Kozik <ivan@ludios.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Ivan Kozik <ivan@ludios.org>
Link: https://lkml.kernel.org/r/YpR2IqndgsyMzN00@worktop.programming.kicks-ass.net
---
 kernel/sched/autogroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 16092b49ff6a..4ebaf97f7bd8 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -36,6 +36,7 @@ void __init autogroup_init(struct task_struct *init_task)
 	kref_init(&autogroup_default.kref);
 	init_rwsem(&autogroup_default.lock);
 	init_task->signal->autogroup = &autogroup_default;
+	sched_autogroup_sysctl_init();
 }
 
 void autogroup_free(struct task_group *tg)
@@ -219,7 +220,6 @@ void sched_autogroup_exit(struct signal_struct *sig)
 static int __init setup_autogroup(char *str)
 {
 	sysctl_sched_autogroup_enabled = 0;
-	sched_autogroup_sysctl_init();
 
 	return 1;
 }
-- 
cgit v1.2.3


From ff979b2a9d9779382030023bfc4e3b1989c8c314 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Fri, 6 May 2022 11:27:37 +0800
Subject: ftrace/fgraph: fix increased missing-prototypes warnings

After commit e999995c84c3 ("ftrace: cleanup ftrace_graph_caller enable
and disable") merged into the linux-next tree, the kernel test robot
(lkp@intel.com) has send out report that there are increased missing-prototypes
warnings caused by that commit.

COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 \
	O=build_dir ARCH=sh SHELL=/bin/bash kernel/trace/
warning: no previous prototype for 'ftrace_enable_ftrace_graph_caller' [-Wmissing-prototypes]
warning: no previous prototype for 'ftrace_disable_ftrace_graph_caller' [-Wmissing-prototypes]
warning: no previous prototype for 'ftrace_return_to_handler' [-Wmissing-prototypes]
warning: no previous prototype for 'ftrace_graph_sleep_time_control' [-Wmissing-prototypes]

BTW there are so many missing-prototypes warnings if build kernel with "W=1".

The increased warnings for 'ftrace_[enable,disable]_ftrace_graph_caller'
is caused by CONFIG_FUNCTION_GRAPH_TRACER && !CONFIG_DYNAMIC_FTRACE,
so the declarations in <linux/ftrace.h> can't be seen in fgraph.c.

And this warning can't reproduce on x86_64 since x86_64 select
HAVE_FUNCTION_GRAPH_TRACER only when DYNAMIC_FTRACE, so fgraph.c will
always see the declarations in <linux/ftrace.h>.

This patch fix the increased warnings by put the definitions in
CONFIG_DYNAMIC_FTRACE although there are no real problems exist.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220506032737.23375-1-zhouchengming@bytedance.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 kernel/trace/fgraph.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 289311680c29..2cd374294be7 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -30,6 +30,7 @@ int ftrace_graph_active;
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 
+#ifdef CONFIG_DYNAMIC_FTRACE
 /*
  * archs can override this function if they must do something
  * to enable hook for graph tracer.
@@ -47,6 +48,7 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 {
 	return 0;
 }
+#endif
 
 /**
  * ftrace_graph_stop - set to permanently disable function graph tracing
-- 
cgit v1.2.3


From 662ce1dc9caf493c309200edbe38d186f1ea20d0 Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Wed, 1 Jun 2022 15:55:25 -0700
Subject: delayacct: track delays from write-protect copy

Delay accounting does not track the delay of write-protect copy.  When
tasks trigger many write-protect copys(include COW and unsharing of
anonymous pages[1]), it may spend a amount of time waiting for them.  To
get the delay of tasks in write-protect copy, could help users to evaluate
the impact of using KSM or fork() or GUP.

Also update tools/accounting/getdelays.c:

    / # ./getdelays -dl -p 231
    print delayacct stats ON
    listen forever
    PID     231

    CPU             count     real total  virtual total    delay total  delay average
                     6247     1859000000     2154070021     1674255063          0.268ms
    IO              count    delay total  delay average
                        0              0              0ms
    SWAP            count    delay total  delay average
                        0              0              0ms
    RECLAIM         count    delay total  delay average
                        0              0              0ms
    THRASHING       count    delay total  delay average
                        0              0              0ms
    COMPACT         count    delay total  delay average
                        3          72758              0ms
    WPCOPY          count    delay total  delay average
                     3635      271567604              0ms

[1] commit 31cc5bc4af70("mm: support GUP-triggered unsharing of anonymous pages")

Link: https://lkml.kernel.org/r/20220409014342.2505532-1-yang.yang29@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Reviewed-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Reviewed-by: wangyong <wang.yong12@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/delayacct.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2c1e18f7c5cf..164ed9ef77a3 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -177,11 +177,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
 	tmp = d->compact_delay_total + tsk->delays->compact_delay;
 	d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
+	tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
+	d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
 	d->thrashing_count += tsk->delays->thrashing_count;
 	d->compact_count += tsk->delays->compact_count;
+	d->wpcopy_count += tsk->delays->wpcopy_count;
 	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
@@ -249,3 +252,16 @@ void __delayacct_compact_end(void)
 		      &current->delays->compact_delay,
 		      &current->delays->compact_count);
 }
+
+void __delayacct_wpcopy_start(void)
+{
+	current->delays->wpcopy_start = local_clock();
+}
+
+void __delayacct_wpcopy_end(void)
+{
+	delayacct_end(&current->delays->lock,
+		      &current->delays->wpcopy_start,
+		      &current->delays->wpcopy_delay,
+		      &current->delays->wpcopy_count);
+}
-- 
cgit v1.2.3


From e19f8fa6ce1ca9b8b934ba7d2e8f34c95abc6e60 Mon Sep 17 00:00:00 2001
From: Rob Clark <robdclark@chromium.org>
Date: Wed, 1 Jun 2022 07:51:16 -0700
Subject: dma-debug: make things less spammy under memory pressure

Limit the error msg to avoid flooding the console.  If you have a lot of
threads hitting this at once, they could have already gotten passed the
dma_debug_disabled() check before they get to the point of allocation
failure, resulting in quite a lot of this error message spamming the
log.  Use pr_err_once() to limit that.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index ac740630c79c..2caafd13f8aa 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -564,7 +564,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
 
 	rc = active_cacheline_insert(entry);
 	if (rc == -ENOMEM) {
-		pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
+		pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
 		global_disable = true;
 	} else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		err_printk(entry->dev, entry,
-- 
cgit v1.2.3


From e15db62bc5648ab459a570862f654e787c498faf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 1 Jun 2022 20:49:39 +0200
Subject: swiotlb: fix setting ->force_bounce

The swiotlb_init refactor messed up assigning ->force_bounce by doing
it in different places based on what caused the setting of the flag.

Fix this by passing the SWIOTLB_* flags to swiotlb_init_io_tlb_mem
and just setting it there.

Fixes: c6af2aa9ffc9 ("swiotlb: make the swiotlb_init interface more useful")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Nathan Chancellor <nathan@kernel.org>
---
 kernel/dma/swiotlb.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index dfa1de89dc94..cb50f8d38360 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -192,7 +192,7 @@ void __init swiotlb_update_mem_attributes(void)
 }
 
 static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
-				    unsigned long nslabs, bool late_alloc)
+		unsigned long nslabs, unsigned int flags, bool late_alloc)
 {
 	void *vaddr = phys_to_virt(start);
 	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -203,8 +203,7 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->index = 0;
 	mem->late_alloc = late_alloc;
 
-	if (swiotlb_force_bounce)
-		mem->force_bounce = true;
+	mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
 
 	spin_lock_init(&mem->lock);
 	for (i = 0; i < mem->nslabs; i++) {
@@ -275,8 +274,7 @@ retry:
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
-	mem->force_bounce = flags & SWIOTLB_FORCE;
+	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false);
 
 	if (flags & SWIOTLB_VERBOSE)
 		swiotlb_print_info();
@@ -348,7 +346,7 @@ retry:
 
 	set_memory_decrypted((unsigned long)vstart,
 			     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
-	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, true);
+	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true);
 
 	swiotlb_print_info();
 	return 0;
@@ -835,8 +833,8 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 
 		set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
 				     rmem->size >> PAGE_SHIFT);
-		swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
-		mem->force_bounce = true;
+		swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
+				false);
 		mem->for_alloc = true;
 
 		rmem->priv = mem;
-- 
cgit v1.2.3


From 587b9bfe0668bc997e51af9526a0c7c084d4660f Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Wed, 1 Jun 2022 01:11:02 +0300
Subject: kernel/reboot: Use static handler for register_platform_power_off()

The register_platform_power_off() fails on m68k platform due to the
memory allocation error that happens at a very early boot time when
memory allocator isn't available yet. Fix it by using a static sys-off
handler for the platform-level power-off handlers.

Fixes: f0f7e5265b3b ("m68k: Switch to new sys-off handler API")
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index a091145ee710..3b19b123efec 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -315,6 +315,37 @@ static int sys_off_notify(struct notifier_block *nb,
 	return handler->sys_off_cb(&data);
 }
 
+static struct sys_off_handler platform_sys_off_handler;
+
+static struct sys_off_handler *alloc_sys_off_handler(int priority)
+{
+	struct sys_off_handler *handler;
+
+	/*
+	 * Platforms like m68k can't allocate sys_off handler dynamically
+	 * at the early boot time because memory allocator isn't available yet.
+	 */
+	if (priority == SYS_OFF_PRIO_PLATFORM) {
+		handler = &platform_sys_off_handler;
+		if (handler->cb_data)
+			return ERR_PTR(-EBUSY);
+	} else {
+		handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+		if (!handler)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	return handler;
+}
+
+static void free_sys_off_handler(struct sys_off_handler *handler)
+{
+	if (handler == &platform_sys_off_handler)
+		memset(handler, 0, sizeof(*handler));
+	else
+		kfree(handler);
+}
+
 /**
  *	register_sys_off_handler - Register sys-off handler
  *	@mode: Sys-off mode
@@ -345,9 +376,9 @@ register_sys_off_handler(enum sys_off_mode mode,
 	struct sys_off_handler *handler;
 	int err;
 
-	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
-	if (!handler)
-		return ERR_PTR(-ENOMEM);
+	handler = alloc_sys_off_handler(priority);
+	if (IS_ERR(handler))
+		return handler;
 
 	switch (mode) {
 	case SYS_OFF_MODE_POWER_OFF_PREPARE:
@@ -364,7 +395,7 @@ register_sys_off_handler(enum sys_off_mode mode,
 		break;
 
 	default:
-		kfree(handler);
+		free_sys_off_handler(handler);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -391,7 +422,7 @@ register_sys_off_handler(enum sys_off_mode mode,
 	}
 
 	if (err) {
-		kfree(handler);
+		free_sys_off_handler(handler);
 		return ERR_PTR(err);
 	}
 
@@ -422,7 +453,7 @@ void unregister_sys_off_handler(struct sys_off_handler *handler)
 	/* sanity check, shall never happen */
 	WARN_ON(err);
 
-	kfree(handler);
+	free_sys_off_handler(handler);
 }
 EXPORT_SYMBOL_GPL(unregister_sys_off_handler);
 
-- 
cgit v1.2.3


From 73503963b715a64a44aa2b1c486114b917a17c73 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 1 Jun 2022 20:56:52 -0700
Subject: module: Fix prefix for module.sig_enforce module param

Commit cfc1d277891e ("module: Move all into module/") changed the prefix
of the module param by moving/renaming files.  A later commit also moves
the module_param() into a different file, thereby changing the prefix
yet again.

This would break kernel cmdline compatibility and also userspace
compatibility at /sys/module/module/parameters/sig_enforce.

So, set the prefix back to "module.".

Fixes: cfc1d277891e ("module: Move all into module/")
Link: https://lore.kernel.org/lkml/20220602034111.4163292-1-saravanak@google.com/
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Aaron Tomlin <atomlin@redhat.com>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module/signing.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module/signing.c b/kernel/module/signing.c
index 85c8999dfecf..a2ff4242e623 100644
--- a/kernel/module/signing.c
+++ b/kernel/module/signing.c
@@ -16,6 +16,9 @@
 #include <uapi/linux/module.h>
 #include "internal.h"
 
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+
 static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
 module_param(sig_enforce, bool_enable_only, 0644);
 
-- 
cgit v1.2.3


From 2130a790ca49763f724ec45cf93b9dd765e2023e Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 2 Jun 2022 15:05:26 +0200
Subject: kernel: add platform_has() infrastructure

Add a simple infrastructure for setting, resetting and querying
platform feature flags.

Flags can be either global or architecture specific.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Tested-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> # Arm64 only
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 kernel/Makefile           |  2 +-
 kernel/platform-feature.c | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 kernel/platform-feature.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 318789c728d3..a7e1f49ab2b3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    cpu.o exit.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o user.o \
 	    signal.o sys.o umh.o workqueue.o pid.o task_work.o \
-	    extable.o params.o \
+	    extable.o params.o platform-feature.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o regset.o
diff --git a/kernel/platform-feature.c b/kernel/platform-feature.c
new file mode 100644
index 000000000000..cb6a6c3e4fed
--- /dev/null
+++ b/kernel/platform-feature.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <linux/cache.h>
+#include <linux/export.h>
+#include <linux/platform-feature.h>
+
+#define PLATFORM_FEAT_ARRAY_SZ  BITS_TO_LONGS(PLATFORM_FEAT_N)
+static unsigned long __read_mostly platform_features[PLATFORM_FEAT_ARRAY_SZ];
+
+void platform_set(unsigned int feature)
+{
+	set_bit(feature, platform_features);
+}
+EXPORT_SYMBOL_GPL(platform_set);
+
+void platform_clear(unsigned int feature)
+{
+	clear_bit(feature, platform_features);
+}
+EXPORT_SYMBOL_GPL(platform_clear);
+
+bool platform_has(unsigned int feature)
+{
+	return test_bit(feature, platform_features);
+}
+EXPORT_SYMBOL_GPL(platform_has);
-- 
cgit v1.2.3


From 3e684903a8574ffc9475fdf13c4780a7adb506ad Mon Sep 17 00:00:00 2001
From: Seth Forshee <sforshee@digitalocean.com>
Date: Wed, 4 May 2022 13:08:40 -0500
Subject: entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set

A livepatch transition may stall indefinitely when a kvm vCPU is heavily
loaded. To the host, the vCPU task is a user thread which is spending a
very long time in the ioctl(KVM_RUN) syscall. During livepatch
transition, set_notify_signal() will be called on such tasks to
interrupt the syscall so that the task can be transitioned. This
interrupts guest execution, but when xfer_to_guest_mode_work() sees that
TIF_NOTIFY_SIGNAL is set but not TIF_SIGPENDING it concludes that an
exit to user mode is unnecessary, and guest execution is resumed without
transitioning the task for the livepatch.

This handling of TIF_NOTIFY_SIGNAL is incorrect, as set_notify_signal()
is expected to break tasks out of interruptible kernel loops and cause
them to return to userspace. Change xfer_to_guest_mode_work() to handle
TIF_NOTIFY_SIGNAL the same as TIF_SIGPENDING, signaling to the vCPU run
loop that an exit to userpsace is needed. Any pending task_work will be
run when get_signal() is called from exit_to_user_mode_loop(), so there
is no longer any need to run task work from xfer_to_guest_mode_work().

Suggested-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Petr Mladek <pmladek@suse.com>
Signed-off-by: Seth Forshee <sforshee@digitalocean.com>
Message-Id: <20220504180840.2907296-1-sforshee@digitalocean.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 kernel/entry/kvm.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 9d09f489b60e..2e0f75bcb7fd 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 		int ret;
 
 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
-			clear_notify_signal();
-			if (task_work_pending(current))
-				task_work_run();
-		}
-
-		if (ti_work & _TIF_SIGPENDING) {
 			kvm_handle_signal_exit(vcpu);
 			return -EINTR;
 		}
-- 
cgit v1.2.3


From c4f135d643823a869becfa87539f7820ef9d5bfa Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 1 Jun 2022 16:32:47 +0900
Subject: workqueue: Wrap flush_workqueue() using a macro

Since flush operation synchronously waits for completion, flushing
system-wide WQs (e.g. system_wq) might introduce possibility of deadlock
due to unexpected locking dependency. Tejun Heo commented at [1] that it
makes no sense at all to call flush_workqueue() on the shared WQs as the
caller has no idea what it's gonna end up waiting for.

Although there is flush_scheduled_work() which flushes system_wq WQ with
"Think twice before calling this function! It's very easy to get into
trouble if you don't take great care." warning message, syzbot found a
circular locking dependency caused by flushing system_wq WQ [2].

Therefore, let's change the direction to that developers had better use
their local WQs if flush_scheduled_work()/flush_workqueue(system_*_wq) is
inevitable.

Steps for converting system-wide WQs into local WQs are explained at [3],
and a conversion to stop flushing system-wide WQs is in progress. Now we
want some mechanism for preventing developers who are not aware of this
conversion from again start flushing system-wide WQs.

Since I found that WARN_ON() is complete but awkward approach for teaching
developers about this problem, let's use __compiletime_warning() for
incomplete but handy approach. For completeness, we will also insert
WARN_ON() into __flush_workqueue() after all in-tree users stopped calling
flush_scheduled_work().

Link: https://lore.kernel.org/all/YgnQGZWT%2Fn3VAITX@slm.duckdns.org/ [1]
Link: https://syzkaller.appspot.com/bug?extid=bde0f89deacca7c765b8 [2]
Link: https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp [3]
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4056f2a3f9d5..1ea50f6be843 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2788,13 +2788,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
 }
 
 /**
- * flush_workqueue - ensure that any scheduled work has run to completion.
+ * __flush_workqueue - ensure that any scheduled work has run to completion.
  * @wq: workqueue to flush
  *
  * This function sleeps until all work items which were queued on entry
  * have finished execution, but it is not livelocked by new incoming ones.
  */
-void flush_workqueue(struct workqueue_struct *wq)
+void __flush_workqueue(struct workqueue_struct *wq)
 {
 	struct wq_flusher this_flusher = {
 		.list = LIST_HEAD_INIT(this_flusher.list),
@@ -2943,7 +2943,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 out_unlock:
 	mutex_unlock(&wq->mutex);
 }
-EXPORT_SYMBOL(flush_workqueue);
+EXPORT_SYMBOL(__flush_workqueue);
 
 /**
  * drain_workqueue - drain a workqueue
@@ -2971,7 +2971,7 @@ void drain_workqueue(struct workqueue_struct *wq)
 		wq->flags |= __WQ_DRAINING;
 	mutex_unlock(&wq->mutex);
 reflush:
-	flush_workqueue(wq);
+	__flush_workqueue(wq);
 
 	mutex_lock(&wq->mutex);
 
@@ -6111,3 +6111,11 @@ void __init workqueue_init(void)
 	wq_online = true;
 	wq_watchdog_init();
 }
+
+/*
+ * Despite the naming, this is a no-op function which is here only for avoiding
+ * link error. Since compile-time warning may fail to catch, we will need to
+ * emit run-time warning from __flush_workqueue().
+ */
+void __warn_flushing_systemwide_wq(void) { }
+EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
-- 
cgit v1.2.3


From fd58f7df2415ef747782e01f94880fefad1247cf Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 26 May 2022 13:24:05 +0300
Subject: bpf: Use safer kvmalloc_array() where possible

The kvmalloc_array() function is safer because it has a check for
integer overflows.  These sizes come from the user and I was not
able to see any bounds checking so an integer overflow seems like a
realistic concern.

Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/Yo9VRVMeHbALyjUH@kili
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 10b157a6d73e..7a13e6ac6327 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2263,11 +2263,11 @@ static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32
 	int err = -ENOMEM;
 	unsigned int i;
 
-	syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL);
+	syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL);
 	if (!syms)
 		goto error;
 
-	buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL);
+	buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL);
 	if (!buf)
 		goto error;
 
@@ -2464,7 +2464,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 		return -EINVAL;
 
 	size = cnt * sizeof(*addrs);
-	addrs = kvmalloc(size, GFP_KERNEL);
+	addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
 	if (!addrs)
 		return -ENOMEM;
 
@@ -2489,7 +2489,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 
 	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
 	if (ucookies) {
-		cookies = kvmalloc(size, GFP_KERNEL);
+		cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
 		if (!cookies) {
 			err = -ENOMEM;
 			goto error;
-- 
cgit v1.2.3


From f858c2b2ca04fc7ead291821a793638ae120c11d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Mon, 6 Jun 2022 09:52:51 +0200
Subject: bpf: Fix calling global functions from BPF_PROG_TYPE_EXT programs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The verifier allows programs to call global functions as long as their
argument types match, using BTF to check the function arguments. One of the
allowed argument types to such global functions is PTR_TO_CTX; however the
check for this fails on BPF_PROG_TYPE_EXT functions because the verifier
uses the wrong type to fetch the vmlinux BTF ID for the program context
type. This failure is seen when an XDP program is loaded using
libxdp (which loads it as BPF_PROG_TYPE_EXT and attaches it to a global XDP
type program).

Fix the issue by passing in the target program type instead of the
BPF_PROG_TYPE_EXT type to bpf_prog_get_ctx() when checking function
argument compatibility.

The first Fixes tag refers to the latest commit that touched the code in
question, while the second one points to the code that first introduced
the global function call verification.

v2:
- Use resolve_prog_type()

Fixes: 3363bd0cfbb8 ("bpf: Extend kfunc with PTR_TO_CTX, PTR_TO_MEM argument support")
Fixes: 51c39bb1d5d1 ("bpf: Introduce function-by-function verification")
Reported-by: Simon Sundberg <simon.sundberg@kau.se>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20220606075253.28422-1-toke@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7bccaa4646e5..63d0ac7dfe2f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6054,6 +6054,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    struct bpf_reg_state *regs,
 				    bool ptr_to_mem_ok)
 {
+	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
 	bool is_kfunc = btf_is_kernel(btf);
@@ -6171,7 +6172,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				return -EINVAL;
 			}
 			/* rest of the arguments can be anything, like normal kfunc */
-		} else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
+		} else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
-- 
cgit v1.2.3


From 2b8c612c6102f751e6e3e1bd425f64e9d3d3f638 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Mon, 6 Jun 2022 19:56:40 +0300
Subject: kernel/reboot: Fix powering off using a non-syscall code paths

There are other methods of powering off machine than the reboot syscall.
Previously we missed to cover those methods and it created power-off
regression for some machines, like the PowerPC e500.

Fix this problem by moving the legacy sys-off handler registration to
the latest phase of power-off process and making the kernel_can_power_off()
check the legacy pm_power_off presence.

Tested-by: Michael Ellerman <mpe@ellerman.id.au> # ppce500
Reported-by: Michael Ellerman <mpe@ellerman.id.au> # ppce500
Fixes: da007f171fc9 ("kernel/reboot: Change registration order of legacy power-off handler")
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/reboot.c | 46 ++++++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3b19b123efec..b5a71d1ff603 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -320,6 +320,7 @@ static struct sys_off_handler platform_sys_off_handler;
 static struct sys_off_handler *alloc_sys_off_handler(int priority)
 {
 	struct sys_off_handler *handler;
+	gfp_t flags;
 
 	/*
 	 * Platforms like m68k can't allocate sys_off handler dynamically
@@ -330,7 +331,12 @@ static struct sys_off_handler *alloc_sys_off_handler(int priority)
 		if (handler->cb_data)
 			return ERR_PTR(-EBUSY);
 	} else {
-		handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+		if (system_state > SYSTEM_RUNNING)
+			flags = GFP_ATOMIC;
+		else
+			flags = GFP_KERNEL;
+
+		handler = kzalloc(sizeof(*handler), flags);
 		if (!handler)
 			return ERR_PTR(-ENOMEM);
 	}
@@ -440,7 +446,7 @@ void unregister_sys_off_handler(struct sys_off_handler *handler)
 {
 	int err;
 
-	if (!handler)
+	if (IS_ERR_OR_NULL(handler))
 		return;
 
 	if (handler->blocking)
@@ -615,7 +621,23 @@ static void do_kernel_power_off_prepare(void)
  */
 void do_kernel_power_off(void)
 {
+	struct sys_off_handler *sys_off = NULL;
+
+	/*
+	 * Register sys-off handlers for legacy PM callback. This allows
+	 * legacy PM callbacks temporary co-exist with the new sys-off API.
+	 *
+	 * TODO: Remove legacy handlers once all legacy PM users will be
+	 *       switched to the sys-off based APIs.
+	 */
+	if (pm_power_off)
+		sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+						   SYS_OFF_PRIO_DEFAULT,
+						   legacy_pm_power_off, NULL);
+
 	atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
+
+	unregister_sys_off_handler(sys_off);
 }
 
 /**
@@ -626,7 +648,8 @@ void do_kernel_power_off(void)
  */
 bool kernel_can_power_off(void)
 {
-	return !atomic_notifier_call_chain_is_empty(&power_off_handler_list);
+	return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) ||
+		pm_power_off;
 }
 EXPORT_SYMBOL_GPL(kernel_can_power_off);
 
@@ -661,7 +684,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		void __user *, arg)
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(current);
-	struct sys_off_handler *sys_off = NULL;
 	char buffer[256];
 	int ret = 0;
 
@@ -686,21 +708,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	if (ret)
 		return ret;
 
-	/*
-	 * Register sys-off handlers for legacy PM callback. This allows
-	 * legacy PM callbacks temporary co-exist with the new sys-off API.
-	 *
-	 * TODO: Remove legacy handlers once all legacy PM users will be
-	 *       switched to the sys-off based APIs.
-	 */
-	if (pm_power_off) {
-		sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
-						   SYS_OFF_PRIO_DEFAULT,
-						   legacy_pm_power_off, NULL);
-		if (IS_ERR(sys_off))
-			return PTR_ERR(sys_off);
-	}
-
 	/* Instead of trying to make the power_off code look like
 	 * halt when pm_power_off is not set do it the easy way.
 	 */
@@ -758,7 +765,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 		break;
 	}
 	mutex_unlock(&system_transition_mutex);
-	unregister_sys_off_handler(sys_off);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 668a9fe5c6a1bcac6b65d5e9b91a9eca86f782a3 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 8 Jun 2022 14:45:35 +0100
Subject: genirq: PM: Use runtime PM for chained interrupts

When requesting an interrupt, we correctly call into the runtime
PM framework to guarantee that the underlying interrupt controller
is up and running.

However, we fail to do so for chained interrupt controllers, as
the mux interrupt is not requested along the same path.

Augment __irq_do_set_handler() to call into the runtime PM code
in this case, making sure the PM flow is the same for all interrupts.

Reported-by: Lucas Stach <l.stach@pengutronix.de>
Tested-by: Liu Ying <victor.liu@nxp.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/26973cddee5f527ea17184c0f3fccb70bc8969a0.camel@pengutronix.de
---
 kernel/irq/chip.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e6b8e564b37f..886789dcee43 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1006,8 +1006,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		if (desc->irq_data.chip != &no_irq_chip)
 			mask_ack_irq(desc);
 		irq_state_set_disabled(desc);
-		if (is_chained)
+		if (is_chained) {
 			desc->action = NULL;
+			WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+		}
 		desc->depth = 1;
 	}
 	desc->handle_irq = handle;
@@ -1033,6 +1035,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
 		desc->action = &chained_action;
+		WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
 		irq_activate_and_startup(desc, IRQ_RESEND);
 	}
 }
-- 
cgit v1.2.3


From 04193d590b390ec7a0592630f46d559ec6564ba1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 7 Jun 2022 22:41:55 +0200
Subject: sched: Fix balance_push() vs __sched_setscheduler()

The purpose of balance_push() is to act as a filter on task selection
in the case of CPU hotplug, specifically when taking the CPU out.

It does this by (ab)using the balance callback infrastructure, with
the express purpose of keeping all the unlikely/odd cases in a single
place.

In order to serve its purpose, the balance_push_callback needs to be
(exclusively) on the callback list at all times (noting that the
callback always places itself back on the list the moment it runs,
also noting that when the CPU goes down, regular balancing concerns
are moot, so ignoring them is fine).

And here-in lies the problem, __sched_setscheduler()'s use of
splice_balance_callbacks() takes the callbacks off the list across a
lock-break, making it possible for, an interleaving, __schedule() to
see an empty list and not get filtered.

Fixes: ae7927023243 ("sched: Optimize finish_lock_switch()")
Reported-by: Jing-Ting Wu <jing-ting.wu@mediatek.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Jing-Ting Wu <jing-ting.wu@mediatek.com>
Link: https://lkml.kernel.org/r/20220519134706.GH2578@worktop.programming.kicks-ass.net
---
 kernel/sched/core.c  | 36 +++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h |  5 +++++
 2 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfa7452ca92e..da0bf6fe9ecd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4798,25 +4798,55 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
 
 static void balance_push(struct rq *rq);
 
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be ran in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
 struct callback_head balance_push_callback = {
 	.next = NULL,
 	.func = (void (*)(struct callback_head *))balance_push,
 };
 
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct callback_head *
+__splice_balance_callbacks(struct rq *rq, bool split)
 {
 	struct callback_head *head = rq->balance_callback;
 
+	if (likely(!head))
+		return NULL;
+
 	lockdep_assert_rq_held(rq);
-	if (head)
+	/*
+	 * Must not take balance_push_callback off the list when
+	 * splice_balance_callbacks() and balance_callbacks() are not
+	 * in the same rq->lock section.
+	 *
+	 * In that case it would be possible for __schedule() to interleave
+	 * and observe the list empty.
+	 */
+	if (split && head == &balance_push_callback)
+		head = NULL;
+	else
 		rq->balance_callback = NULL;
 
 	return head;
 }
 
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+	return __splice_balance_callbacks(rq, true);
+}
+
 static void __balance_callbacks(struct rq *rq)
 {
-	do_balance_callbacks(rq, splice_balance_callbacks(rq));
+	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
 }
 
 static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 01259611beb9..47b89a0fc6e5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1693,6 +1693,11 @@ queue_balance_callback(struct rq *rq,
 {
 	lockdep_assert_rq_held(rq);
 
+	/*
+	 * Don't (re)queue an already queued item; nor queue anything when
+	 * balance_push() is active, see the comment with
+	 * balance_push_callback.
+	 */
 	if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
 		return;
 
-- 
cgit v1.2.3


From 4051a81774d6d8e28192742c26999d6f29bc0e68 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 17 May 2022 11:16:14 +0200
Subject: locking/lockdep: Use sched_clock() for random numbers

Since the rewrote of prandom_u32(), in the commit mentioned below, the
function uses sleeping locks which extracing random numbers and filling
the batch.
This breaks lockdep on PREEMPT_RT because lock_pin_lock() disables
interrupts while calling __lock_pin_lock(). This can't be moved earlier
because the main user of the function (rq_pin_lock()) invokes that
function after disabling interrupts in order to acquire the lock.

The cookie does not require random numbers as its goal is to provide a
random value in order to notice unexpected "unlock + lock" sites.

Use sched_clock() to provide random numbers.

Fixes: a0103f4d86f88 ("random32: use real rng for non-deterministic randomness")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YoNn3pTkm5+QzE5k@linutronix.de
---
 kernel/locking/lockdep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 81e87280513e..f06b91ca6482 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5432,7 +5432,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
 			 * be guessable and still allows some pin nesting in
 			 * our u32 pin_count.
 			 */
-			cookie.val = 1 + (prandom_u32() >> 16);
+			cookie.val = 1 + (sched_clock() & 0xffff);
 			hlock->pin_count += cookie.val;
 			return cookie;
 		}
-- 
cgit v1.2.3


From 57cd6d157eb479f0a8e820fd36b7240845c8a937 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Tue, 31 May 2022 10:59:10 -0700
Subject: cfi: Fix __cfi_slowpath_diag RCU usage with cpuidle

RCU_NONIDLE usage during __cfi_slowpath_diag can result in an invalid
RCU state in the cpuidle code path:

  WARNING: CPU: 1 PID: 0 at kernel/rcu/tree.c:613 rcu_eqs_enter+0xe4/0x138
  ...
  Call trace:
    rcu_eqs_enter+0xe4/0x138
    rcu_idle_enter+0xa8/0x100
    cpuidle_enter_state+0x154/0x3a8
    cpuidle_enter+0x3c/0x58
    do_idle.llvm.6590768638138871020+0x1f4/0x2ec
    cpu_startup_entry+0x28/0x2c
    secondary_start_kernel+0x1b8/0x220
    __secondary_switched+0x94/0x98

Instead, call rcu_irq_enter/exit to wake up RCU only when needed and
disable interrupts for the entire CFI shadow/module check when we do.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Link: https://lore.kernel.org/r/20220531175910.890307-1-samitolvanen@google.com
Fixes: cf68fffb66d6 ("add support for Clang CFI")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/cfi.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cfi.c b/kernel/cfi.c
index 9594cfd1cf2c..08102d19ec15 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -281,6 +281,8 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
 static inline cfi_check_fn find_check_fn(unsigned long ptr)
 {
 	cfi_check_fn fn = NULL;
+	unsigned long flags;
+	bool rcu_idle;
 
 	if (is_kernel_text(ptr))
 		return __cfi_check;
@@ -290,13 +292,21 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr)
 	 * the shadow and __module_address use RCU, so we need to wake it
 	 * up if necessary.
 	 */
-	RCU_NONIDLE({
-		if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
-			fn = find_shadow_check_fn(ptr);
+	rcu_idle = !rcu_is_watching();
+	if (rcu_idle) {
+		local_irq_save(flags);
+		rcu_irq_enter();
+	}
+
+	if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
+		fn = find_shadow_check_fn(ptr);
+	if (!fn)
+		fn = find_module_check_fn(ptr);
 
-		if (!fn)
-			fn = find_module_check_fn(ptr);
-	});
+	if (rcu_idle) {
+		rcu_irq_exit();
+		local_irq_restore(flags);
+	}
 
 	return fn;
 }
-- 
cgit v1.2.3


From d1a374a1aeb7e31191448e225ed2f9c5e894f280 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 15 Jun 2022 09:51:51 +0530
Subject: bpf: Limit maximum modifier chain length in btf_check_type_tags

On processing a module BTF of module built for an older kernel, we might
sometimes find that some type points to itself forming a loop. If such a
type is a modifier, btf_check_type_tags's while loop following modifier
chain will be caught in an infinite loop.

Fix this by defining a maximum chain length and bailing out if we spin
any longer than that.

Fixes: eb596b090558 ("bpf: Ensure type tags precede modifiers in BTF")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220615042151.2266537-1-memxor@gmail.com
---
 kernel/bpf/btf.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63d0ac7dfe2f..eb12d4f705cc 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4815,6 +4815,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	n = btf_nr_types(btf);
 	for (i = start_id; i < n; i++) {
 		const struct btf_type *t;
+		int chain_limit = 32;
 		u32 cur_id = i;
 
 		t = btf_type_by_id(btf, i);
@@ -4827,6 +4828,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 
 		in_tags = btf_type_is_type_tag(t);
 		while (btf_type_is_modifier(t)) {
+			if (!chain_limit--) {
+				btf_verifier_log(env, "Max chain length or cycle detected");
+				return -ELOOP;
+			}
 			if (btf_type_is_type_tag(t)) {
 				if (!in_tags) {
 					btf_verifier_log(env, "Type tags don't precede modifiers");
-- 
cgit v1.2.3


From c3230283e2819a69dad2cf7a63143fde8bab8b5c Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Wed, 15 Jun 2022 18:28:04 +0200
Subject: printk: Block console kthreads when direct printing will be required

There are known situations when the console kthreads are not
reliable or does not work in principle, for example, early boot,
panic, shutdown.

For these situations there is the direct (legacy) mode when printk() tries
to get console_lock() and flush the messages directly. It works very well
during the early boot when the console kthreads are not available at all.
It gets more complicated in the other situations when console kthreads
might be actively printing and block console_trylock() in printk().

The same problem is in the legacy code as well. Any console_lock()
owner could block console_trylock() in printk(). It is solved by
a trick that the current console_lock() owner is responsible for
printing all pending messages. It is actually the reason why there
is the risk of softlockups and why the console kthreads were
introduced.

The console kthreads use the same approach. They are responsible
for printing the messages by definition. So that they handle
the messages anytime when they are awake and see new ones.
The global console_lock is available when there is nothing
to do.

It should work well when the problematic context is correctly
detected and printk() switches to the direct mode. But it seems
that it is not enough in practice. There are reports that
the messages are not printed during panic() or shutdown()
even though printk() tries to use the direct mode here.

The problem seems to be that console kthreads become active in these
situation as well. They steel the job before other CPUs are stopped.
Then they are stopped in the middle of the job and block the global
console_lock.

First part of the solution is to block console kthreads when
the system is in a problematic state and requires the direct
printk() mode.

Link: https://lore.kernel.org/r/20220610205038.GA3050413@paulmck-ThinkPad-P17-Gen-1
Link: https://lore.kernel.org/r/CAMdYzYpF4FNTBPZsEFeWRuEwSies36QM_As8osPWZSr2q-viEA@mail.gmail.com
Suggested-by: John Ogness <john.ogness@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220615162805.27962-2-pmladek@suse.com
---
 kernel/printk/printk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea3dd55709e7..45c6c2b0b104 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3729,7 +3729,9 @@ static bool printer_should_wake(struct console *con, u64 seq)
 		return true;
 
 	if (con->blocked ||
-	    console_kthreads_atomically_blocked()) {
+	    console_kthreads_atomically_blocked() ||
+	    system_state > SYSTEM_RUNNING ||
+	    oops_in_progress) {
 		return false;
 	}
 
-- 
cgit v1.2.3


From b87f02307d3cfbda768520f0687c51ca77e14fc3 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Wed, 15 Jun 2022 18:28:05 +0200
Subject: printk: Wait for the global console lock when the system is going
 down

There are reports that the console kthreads block the global console
lock when the system is going down, for example, reboot, panic.

First part of the solution was to block kthreads in these problematic
system states so they stopped handling newly added messages.

Second part of the solution is to wait when for the kthreads when
they are actively printing. It solves the problem when a message
was printed before the system entered the problematic state and
the kthreads managed to step in.

A busy waiting has to be used because panic() can be called in any
context and in an unknown state of the scheduler.

There must be a timeout because the kthread might get stuck or sleeping
and never release the lock. The timeout 10s is an arbitrary value
inspired by the softlockup timeout.

Link: https://lore.kernel.org/r/20220610205038.GA3050413@paulmck-ThinkPad-P17-Gen-1
Link: https://lore.kernel.org/r/CAMdYzYpF4FNTBPZsEFeWRuEwSies36QM_As8osPWZSr2q-viEA@mail.gmail.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20220615162805.27962-3-pmladek@suse.com
---
 kernel/panic.c              |  2 ++
 kernel/printk/internal.h    |  2 ++
 kernel/printk/printk.c      |  4 ++++
 kernel/printk/printk_safe.c | 32 ++++++++++++++++++++++++++++++++
 kernel/reboot.c             |  2 ++
 5 files changed, 42 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6737b2332275..fe73d18ecdf0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -273,6 +273,7 @@ void panic(const char *fmt, ...)
 		 * unfortunately means it may not be hardened to work in a
 		 * panic situation.
 		 */
+		try_block_console_kthreads(10000);
 		smp_send_stop();
 	} else {
 		/*
@@ -280,6 +281,7 @@ void panic(const char *fmt, ...)
 		 * kmsg_dump, we will need architecture dependent extra
 		 * works in addition to stopping other CPUs.
 		 */
+		try_block_console_kthreads(10000);
 		crash_smp_send_stop();
 	}
 
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index d947ca6c84f9..e7d8578860ad 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -20,6 +20,8 @@ enum printk_info_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
+extern bool block_console_kthreads;
+
 __printf(4, 0)
 int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 45c6c2b0b104..b095fb5f5f61 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -250,6 +250,9 @@ static atomic_t console_kthreads_active = ATOMIC_INIT(0);
 #define console_kthread_printing_exit() \
 	atomic_dec(&console_kthreads_active)
 
+/* Block console kthreads to avoid processing new messages. */
+bool block_console_kthreads;
+
 /*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
@@ -3730,6 +3733,7 @@ static bool printer_should_wake(struct console *con, u64 seq)
 
 	if (con->blocked ||
 	    console_kthreads_atomically_blocked() ||
+	    block_console_kthreads ||
 	    system_state > SYSTEM_RUNNING ||
 	    oops_in_progress) {
 		return false;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index ef0f9a2044da..caac4de1ea59 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -8,7 +8,9 @@
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/printk.h>
+#include <linux/console.h>
 #include <linux/kprobes.h>
+#include <linux/delay.h>
 
 #include "internal.h"
 
@@ -50,3 +52,33 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	return vprintk_default(fmt, args);
 }
 EXPORT_SYMBOL(vprintk);
+
+/**
+ * try_block_console_kthreads() - Try to block console kthreads and
+ *	make the global console_lock() avaialble
+ *
+ * @timeout_ms:        The maximum time (in ms) to wait.
+ *
+ * Prevent console kthreads from starting processing new messages. Wait
+ * until the global console_lock() become available.
+ *
+ * Context: Can be called in any context.
+ */
+void try_block_console_kthreads(int timeout_ms)
+{
+	block_console_kthreads = true;
+
+	/* Do not wait when the console lock could not be safely taken. */
+	if (this_cpu_read(printk_context) || in_nmi())
+		return;
+
+	while (timeout_ms > 0) {
+		if (console_trylock()) {
+			console_unlock();
+			return;
+		}
+
+		udelay(1000);
+		timeout_ms -= 1;
+	}
+}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 4177645e74d6..310363685502 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -74,6 +74,7 @@ void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
+	try_block_console_kthreads(10000);
 	usermodehelper_disable();
 	device_shutdown();
 }
@@ -262,6 +263,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 	blocking_notifier_call_chain(&reboot_notifier_list,
 		(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
 	system_state = state;
+	try_block_console_kthreads(10000);
 	usermodehelper_disable();
 	device_shutdown();
 }
-- 
cgit v1.2.3


From ef79c396c664be99d0c5660dc75fe863c1e20315 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= <cgzones@googlemail.com>
Date: Wed, 15 Jun 2022 17:44:31 +0200
Subject: audit: free module name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reset the type of the record last as the helper `audit_free_module()`
depends on it.

    unreferenced object 0xffff888153b707f0 (size 16):
      comm "modprobe", pid 1319, jiffies 4295110033 (age 1083.016s)
      hex dump (first 16 bytes):
        62 69 6e 66 6d 74 5f 6d 69 73 63 00 6b 6b 6b a5  binfmt_misc.kkk.
      backtrace:
        [<ffffffffa07dbf9b>] kstrdup+0x2b/0x50
        [<ffffffffa04b0a9d>] __audit_log_kern_module+0x4d/0xf0
        [<ffffffffa03b6664>] load_module+0x9d4/0x2e10
        [<ffffffffa03b8f44>] __do_sys_finit_module+0x114/0x1b0
        [<ffffffffa1f47124>] do_syscall_64+0x34/0x80
        [<ffffffffa200007e>] entry_SYSCALL_64_after_hwframe+0x46/0xb0

Cc: stable@vger.kernel.org
Fixes: 12c5e81d3fd0 ("audit: prepare audit_context for use in calling contexts beyond syscalls")
Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f3a2abd6d1a1..3a8c9d744800 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1014,10 +1014,10 @@ static void audit_reset_context(struct audit_context *ctx)
 	ctx->target_comm[0] = '\0';
 	unroll_tree_refs(ctx, NULL, 0);
 	WARN_ON(!list_empty(&ctx->killed_trees));
-	ctx->type = 0;
 	audit_free_module(ctx);
 	ctx->fds[0] = -1;
 	audit_proctitle_free(ctx);
+	ctx->type = 0; /* reset last for audit_free_*() */
 }
 
 static inline struct audit_context *audit_alloc_context(enum audit_state state)
-- 
cgit v1.2.3


From d25c83c6606ffc3abdf0868136ad3399f648ad70 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Tue, 15 Mar 2022 11:24:44 +0100
Subject: kthread: make it clear that kthread_create_on_node() might be
 terminated by any fatal signal

The comments in kernel/kthread.c create a feeling that only SIGKILL is
able to terminate the creation of kernel kthreads by
kthread_create()/_on_node()/_on_cpu() APIs.

In reality, wait_for_completion_killable() might be killed by any fatal
signal that does not have a custom handler:

	(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
	 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

static inline void signal_wake_up(struct task_struct *t, bool resume)
{
	signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
}

static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
[...]
	/*
	 * Found a killable thread.  If the signal will be fatal,
	 * then start taking the whole group down immediately.
	 */
	if (sig_fatal(p, sig) ...) {
		if (!sig_kernel_coredump(sig)) {
		[...]
			do {
				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
				sigaddset(&t->pending.signal, SIGKILL);
				signal_wake_up(t, 1);
			} while_each_thread(p, t);
			return;
		}
	}
}

Update the comments in kernel/kthread.c to make this more obvious.

The motivation for this change was debugging why a module initialization
failed.  The module was being loaded from initrd.  It "magically" failed
when systemd was switching to the real root.  The clean up operations sent
SIGTERM to various pending processed that were started from initrd.

Link: https://lkml.kernel.org/r/20220315102444.2380-1-pmladek@suse.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Kees Cook <keescook@chromium.org>
Cc: Marco Elver <elver@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kthread.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 544fd4097406..3c677918d8f2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -340,7 +340,7 @@ static int kthread(void *_create)
 
 	self = to_kthread(current);
 
-	/* If user was SIGKILLed, I release the structure. */
+	/* Release the structure when caller killed by a fatal signal. */
 	done = xchg(&create->done, NULL);
 	if (!done) {
 		kfree(create);
@@ -398,7 +398,7 @@ static void create_kthread(struct kthread_create_info *create)
 	/* We want our own signal handler (we take no signals by default). */
 	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
 	if (pid < 0) {
-		/* If user was SIGKILLed, I release the structure. */
+		/* Release the structure when caller killed by a fatal signal. */
 		struct completion *done = xchg(&create->done, NULL);
 
 		if (!done) {
@@ -440,9 +440,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	 */
 	if (unlikely(wait_for_completion_killable(&done))) {
 		/*
-		 * If I was SIGKILLed before kthreadd (or new kernel thread)
-		 * calls complete(), leave the cleanup of this structure to
-		 * that thread.
+		 * If I was killed by a fatal signal before kthreadd (or new
+		 * kernel thread) calls complete(), leave the cleanup of this
+		 * structure to that thread.
 		 */
 		if (xchg(&create->done, NULL))
 			return ERR_PTR(-EINTR);
@@ -876,7 +876,7 @@ fail_task:
  *
  * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
  * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
  */
 struct kthread_worker *
 kthread_create_worker(unsigned int flags, const char namefmt[], ...)
@@ -925,7 +925,7 @@ EXPORT_SYMBOL(kthread_create_worker);
  * Return:
  * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
  * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
- * when the worker was SIGKILLed.
+ * when the caller was killed by a fatal signal.
  */
 struct kthread_worker *
 kthread_create_worker_on_cpu(int cpu, unsigned int flags,
-- 
cgit v1.2.3


From eb1b2985fe5c5f02e43e4c0d47bbe7ed835007f3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 15 Jun 2022 13:21:16 +0200
Subject: ftrace: Keep address offset in ftrace_lookup_symbols

We want to store the resolved address on the same index as
the symbol string, because that's the user (bpf kprobe link)
code assumption.

Also making sure we don't store duplicates that might be
present in kallsyms.

Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Fixes: bed0d9a50dac ("ftrace: Add ftrace_lookup_symbols function")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220615112118.497303-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/ftrace.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e750fe141a60..601ccf1b2f09 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8029,15 +8029,23 @@ static int kallsyms_callback(void *data, const char *name,
 			     struct module *mod, unsigned long addr)
 {
 	struct kallsyms_data *args = data;
+	const char **sym;
+	int idx;
 
-	if (!bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp))
+	sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp);
+	if (!sym)
+		return 0;
+
+	idx = sym - args->syms;
+	if (args->addrs[idx])
 		return 0;
 
 	addr = ftrace_location(addr);
 	if (!addr)
 		return 0;
 
-	args->addrs[args->found++] = addr;
+	args->addrs[idx] = addr;
+	args->found++;
 	return args->found == args->cnt ? 1 : 0;
 }
 
@@ -8062,6 +8070,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a
 	struct kallsyms_data args;
 	int err;
 
+	memset(addrs, 0, sizeof(*addrs) * cnt);
 	args.addrs = addrs;
 	args.syms = sorted_syms;
 	args.cnt = cnt;
-- 
cgit v1.2.3


From eb5fb0325698d05f0bf78d322de82c451a3685a2 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 15 Jun 2022 13:21:17 +0200
Subject: bpf: Force cookies array to follow symbols sorting

When user specifies symbols and cookies for kprobe_multi link
interface it's very likely the cookies will be misplaced and
returned to wrong functions (via get_attach_cookie helper).

The reason is that to resolve the provided functions we sort
them before passing them to ftrace_lookup_symbols, but we do
not do the same sort on the cookie values.

Fixing this by using sort_r function with custom swap callback
that swaps cookie values as well.

Fixes: 0236fec57a15 ("bpf: Resolve symbols with ftrace_lookup_symbols for kprobe multi link")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220615112118.497303-4-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 60 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 45 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7a13e6ac6327..88589d74a892 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2423,7 +2423,7 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
 	kprobe_multi_link_prog_run(link, entry_ip, regs);
 }
 
-static int symbols_cmp(const void *a, const void *b)
+static int symbols_cmp_r(const void *a, const void *b, const void *priv)
 {
 	const char **str_a = (const char **) a;
 	const char **str_b = (const char **) b;
@@ -2431,6 +2431,28 @@ static int symbols_cmp(const void *a, const void *b)
 	return strcmp(*str_a, *str_b);
 }
 
+struct multi_symbols_sort {
+	const char **funcs;
+	u64 *cookies;
+};
+
+static void symbols_swap_r(void *a, void *b, int size, const void *priv)
+{
+	const struct multi_symbols_sort *data = priv;
+	const char **name_a = a, **name_b = b;
+
+	swap(*name_a, *name_b);
+
+	/* If defined, swap also related cookies. */
+	if (data->cookies) {
+		u64 *cookie_a, *cookie_b;
+
+		cookie_a = data->cookies + (name_a - data->funcs);
+		cookie_b = data->cookies + (name_b - data->funcs);
+		swap(*cookie_a, *cookie_b);
+	}
+}
+
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_kprobe_multi_link *link = NULL;
@@ -2468,38 +2490,46 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (!addrs)
 		return -ENOMEM;
 
+	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+	if (ucookies) {
+		cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
+		if (!cookies) {
+			err = -ENOMEM;
+			goto error;
+		}
+		if (copy_from_user(cookies, ucookies, size)) {
+			err = -EFAULT;
+			goto error;
+		}
+	}
+
 	if (uaddrs) {
 		if (copy_from_user(addrs, uaddrs, size)) {
 			err = -EFAULT;
 			goto error;
 		}
 	} else {
+		struct multi_symbols_sort data = {
+			.cookies = cookies,
+		};
 		struct user_syms us;
 
 		err = copy_user_syms(&us, usyms, cnt);
 		if (err)
 			goto error;
 
-		sort(us.syms, cnt, sizeof(*us.syms), symbols_cmp, NULL);
+		if (cookies)
+			data.funcs = us.syms;
+
+		sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
+		       symbols_swap_r, &data);
+
 		err = ftrace_lookup_symbols(us.syms, cnt, addrs);
 		free_user_syms(&us);
 		if (err)
 			goto error;
 	}
 
-	ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
-	if (ucookies) {
-		cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
-		if (!cookies) {
-			err = -ENOMEM;
-			goto error;
-		}
-		if (copy_from_user(cookies, ucookies, size)) {
-			err = -EFAULT;
-			goto error;
-		}
-	}
-
 	link = kzalloc(sizeof(*link), GFP_KERNEL);
 	if (!link) {
 		err = -ENOMEM;
-- 
cgit v1.2.3


From 5cf9c91ba927119fc6606b938b1895bb2459d3bc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 14 Jun 2022 09:48:25 +0200
Subject: block: serialize all debugfs operations using q->debugfs_mutex

Various places like I/O schedulers or the QOS infrastructure try to
register debugfs files on demans, which can race with creating and
removing the main queue debugfs directory.  Use the existing
debugfs_mutex to serialize all debugfs operations that rely on
q->debugfs_dir or the directories hanging off it.

To make the teardown code a little simpler declare all debugfs dentry
pointers and not just the main one uncoditionally in blkdev.h.

Move debugfs_mutex next to the dentries that it protects and document
what it is used for.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220614074827.458955-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 10a32b0f2deb..fe04c6f96ca5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -770,14 +770,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
-	mutex_lock(&q->debugfs_mutex);
 	if (rcu_dereference_protected(q->blk_trace,
 				      lockdep_is_held(&q->debugfs_mutex))) {
 		__blk_trace_startstop(q, 0);
 		__blk_trace_remove(q);
 	}
-
-	mutex_unlock(&q->debugfs_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3


From c0f3bb4054ef036e5f67e27f2e3cad9e6512cf00 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 8 Jun 2022 01:11:12 +0900
Subject: rethook: Reject getting a rethook if RCU is not watching

Since the rethook_recycle() will involve the call_rcu() for reclaiming
the rethook_instance, the rethook must be set up at the RCU available
context (non idle). This rethook_recycle() in the rethook trampoline
handler is inevitable, thus the RCU available check must be done before
setting the rethook trampoline.

This adds a rcu_is_watching() check in the rethook_try_get() so that
it will return NULL if it is called when !rcu_is_watching().

Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/165461827269.280167.7379263615545598958.stgit@devnote2
---
 kernel/trace/rethook.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index b56833700d23..c69d82273ce7 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
 	if (unlikely(!handler))
 		return NULL;
 
+	/*
+	 * This expects the caller will set up a rethook on a function entry.
+	 * When the function returns, the rethook will eventually be reclaimed
+	 * or released in the rethook_recycle() with call_rcu().
+	 * This means the caller must be run in the RCU-availabe context.
+	 */
+	if (unlikely(!rcu_is_watching()))
+		return NULL;
+
 	fn = freelist_try_get(&rh->pool);
 	if (!fn)
 		return NULL;
-- 
cgit v1.2.3


From cc72b72073ac982a954d3b43519ca1c28f03c27c Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Sat, 28 May 2022 00:55:39 +0900
Subject: tracing/kprobes: Check whether get_kretprobe() returns NULL in
 kretprobe_dispatcher()

There is a small chance that get_kretprobe(ri) returns NULL in
kretprobe_dispatcher() when another CPU unregisters the kretprobe
right after __kretprobe_trampoline_handler().

To avoid this issue, kretprobe_dispatcher() checks the get_kretprobe()
return value again. And if it is NULL, it returns soon because that
kretprobe is under unregistering process.

This issue has been introduced when the kretprobe is decoupled
from the struct kretprobe_instance by commit d741bf41d7c7
("kprobes: Remove kretprobe hash"). Before that commit, the
struct kretprob_instance::rp directly points the kretprobe
and it is never be NULL.

Link: https://lkml.kernel.org/r/165366693881.797669.16926184644089588731.stgit@devnote2

Reported-by: Yonghong Song <yhs@fb.com>
Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash")
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: bpf <bpf@vger.kernel.org>
Cc: Kernel Team <kernel-team@fb.com>
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 93507330462c..a245ea673715 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1718,8 +1718,17 @@ static int
 kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	struct kretprobe *rp = get_kretprobe(ri);
-	struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp);
+	struct trace_kprobe *tk;
+
+	/*
+	 * There is a small chance that get_kretprobe(ri) returns NULL when
+	 * the kretprobe is unregister on another CPU between kretprobe's
+	 * trampoline_handler and this function.
+	 */
+	if (unlikely(!rp))
+		return 0;
 
+	tk = container_of(rp, struct trace_kprobe, rp);
 	raw_cpu_inc(*tk->nhit);
 
 	if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
-- 
cgit v1.2.3


From f4b0d318097e45cbac5e14976f8bb56aa2cef504 Mon Sep 17 00:00:00 2001
From: sunliming <sunliming@kylinos.cn>
Date: Thu, 2 Jun 2022 22:06:13 +0800
Subject: tracing: Simplify conditional compilation code in
 tracing_set_tracer()

Two conditional compilation directives "#ifdef CONFIG_TRACER_MAX_TRACE"
are used consecutively, and no other code in between. Simplify conditional
the compilation code and only use one "#ifdef CONFIG_TRACER_MAX_TRACE".

Link: https://lkml.kernel.org/r/20220602140613.545069-1-sunliming@kylinos.cn

Signed-off-by: sunliming <sunliming@kylinos.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c95992e2c71..a8cfac0611bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6424,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 		synchronize_rcu();
 		free_snapshot(tr);
 	}
-#endif
 
-#ifdef CONFIG_TRACER_MAX_TRACE
 	if (t->use_max_tr && !had_max_tr) {
 		ret = tracing_alloc_snapshot_instance(tr);
 		if (ret < 0)
-- 
cgit v1.2.3


From 12c3e0c92fd7cb3d3b698d84fdde7dccb6ba8822 Mon Sep 17 00:00:00 2001
From: Gautam Menghani <gautammenghani201@gmail.com>
Date: Sun, 12 Jun 2022 07:42:32 -0700
Subject: tracing/uprobes: Remove unwanted initialization in
 __trace_uprobe_create()

Remove the unwanted initialization of variable 'ret'. This fixes the clang
scan warning: Value stored to 'ret' is never read [deadcode.DeadStores]

Link: https://lkml.kernel.org/r/20220612144232.145209-1-gautammenghani201@gmail.com

Signed-off-by: Gautam Menghani <gautammenghani201@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_uprobe.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9711589273cd..c3dc4f859a6b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -546,7 +546,6 @@ static int __trace_uprobe_create(int argc, const char **argv)
 	bool is_return = false;
 	int i, ret;
 
-	ret = 0;
 	ref_ctr_offset = 0;
 
 	switch (argv[0][0]) {
-- 
cgit v1.2.3


From 202773260023b56e868d09d13d3a417028f1ff5b Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Date: Fri, 17 Jun 2022 15:24:02 +0300
Subject: PM: hibernate: Use kernel_can_power_off()

Use new kernel_can_power_off() API instead of legacy pm_power_off global
variable to fix regressed hibernation to disk where machine no longer
powers off when it should because ACPI power driver transitioned to the
new sys-off based API and it doesn't use pm_power_off anymore.

Fixes: 98f30d0ecf79 ("ACPI: power: Switch to sys-off handler API")
Tested-by: Ken Moffat <zarniwhoop@ntlworld.com>
Reported-by: Ken Moffat <zarniwhhop@ntlworld.com>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 20a66bf9f465..89c71fce225d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -665,7 +665,7 @@ static void power_down(void)
 		hibernation_platform_enter();
 		fallthrough;
 	case HIBERNATION_SHUTDOWN:
-		if (pm_power_off)
+		if (kernel_can_power_off())
 			kernel_power_off();
 		break;
 	}
-- 
cgit v1.2.3


From 3be4562584bba603f33863a00c1c32eecf772ee6 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Wed, 22 Jun 2022 12:14:24 -0700
Subject: dma-direct: use the correct size for dma_set_encrypted()

The third parameter of dma_set_encrypted() is a size in bytes rather than
the number of pages.

Fixes: 4d0564785bb0 ("dma-direct: factor out dma_set_{de,en}crypted helpers")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index e978f36e6be8..8d0b68a17042 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -357,7 +357,7 @@ void dma_direct_free(struct device *dev, size_t size,
 	} else {
 		if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
 			arch_dma_clear_uncached(cpu_addr, size);
-		if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+		if (dma_set_encrypted(dev, cpu_addr, size))
 			return;
 	}
 
@@ -392,7 +392,6 @@ void dma_direct_free_pages(struct device *dev, size_t size,
 		struct page *page, dma_addr_t dma_addr,
 		enum dma_data_direction dir)
 {
-	unsigned int page_order = get_order(size);
 	void *vaddr = page_address(page);
 
 	/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
@@ -400,7 +399,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
 	    dma_free_from_pool(dev, vaddr, size))
 		return;
 
-	if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+	if (dma_set_encrypted(dev, vaddr, size))
 		return;
 	__dma_direct_free_pages(dev, page, size);
 }
-- 
cgit v1.2.3


From 20fb0c8272bbb102d15bdd11aa64f828619dd7cc Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 23 Jun 2022 16:51:52 +0200
Subject: Revert "printk: Wait for the global console lock when the system is
 going down"

This reverts commit b87f02307d3cfbda768520f0687c51ca77e14fc3.

The testing of 5.19 release candidates revealed missing synchronization
between early and regular console functionality.

It would be possible to start the console kthreads later as a workaround.
But it is clear that console lock serialized console drivers between
each other. It opens a big area of possible problems that were not
considered by people involved in the development and review.

printk() is crucial for debugging kernel issues and console output is
very important part of it. The number of consoles is huge and a proper
review would take some time. As a result it need to be reverted for 5.19.

Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220623145157.21938-2-pmladek@suse.com
---
 kernel/panic.c              |  2 --
 kernel/printk/internal.h    |  2 --
 kernel/printk/printk.c      |  4 ----
 kernel/printk/printk_safe.c | 32 --------------------------------
 kernel/reboot.c             |  2 --
 5 files changed, 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index fe73d18ecdf0..6737b2332275 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -273,7 +273,6 @@ void panic(const char *fmt, ...)
 		 * unfortunately means it may not be hardened to work in a
 		 * panic situation.
 		 */
-		try_block_console_kthreads(10000);
 		smp_send_stop();
 	} else {
 		/*
@@ -281,7 +280,6 @@ void panic(const char *fmt, ...)
 		 * kmsg_dump, we will need architecture dependent extra
 		 * works in addition to stopping other CPUs.
 		 */
-		try_block_console_kthreads(10000);
 		crash_smp_send_stop();
 	}
 
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index e7d8578860ad..d947ca6c84f9 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -20,8 +20,6 @@ enum printk_info_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
-extern bool block_console_kthreads;
-
 __printf(4, 0)
 int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b095fb5f5f61..45c6c2b0b104 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -250,9 +250,6 @@ static atomic_t console_kthreads_active = ATOMIC_INIT(0);
 #define console_kthread_printing_exit() \
 	atomic_dec(&console_kthreads_active)
 
-/* Block console kthreads to avoid processing new messages. */
-bool block_console_kthreads;
-
 /*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
@@ -3733,7 +3730,6 @@ static bool printer_should_wake(struct console *con, u64 seq)
 
 	if (con->blocked ||
 	    console_kthreads_atomically_blocked() ||
-	    block_console_kthreads ||
 	    system_state > SYSTEM_RUNNING ||
 	    oops_in_progress) {
 		return false;
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index caac4de1ea59..ef0f9a2044da 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -8,9 +8,7 @@
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/printk.h>
-#include <linux/console.h>
 #include <linux/kprobes.h>
-#include <linux/delay.h>
 
 #include "internal.h"
 
@@ -52,33 +50,3 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	return vprintk_default(fmt, args);
 }
 EXPORT_SYMBOL(vprintk);
-
-/**
- * try_block_console_kthreads() - Try to block console kthreads and
- *	make the global console_lock() avaialble
- *
- * @timeout_ms:        The maximum time (in ms) to wait.
- *
- * Prevent console kthreads from starting processing new messages. Wait
- * until the global console_lock() become available.
- *
- * Context: Can be called in any context.
- */
-void try_block_console_kthreads(int timeout_ms)
-{
-	block_console_kthreads = true;
-
-	/* Do not wait when the console lock could not be safely taken. */
-	if (this_cpu_read(printk_context) || in_nmi())
-		return;
-
-	while (timeout_ms > 0) {
-		if (console_trylock()) {
-			console_unlock();
-			return;
-		}
-
-		udelay(1000);
-		timeout_ms -= 1;
-	}
-}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 310363685502..4177645e74d6 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -74,7 +74,6 @@ void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
-	try_block_console_kthreads(10000);
 	usermodehelper_disable();
 	device_shutdown();
 }
@@ -263,7 +262,6 @@ static void kernel_shutdown_prepare(enum system_states state)
 	blocking_notifier_call_chain(&reboot_notifier_list,
 		(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
 	system_state = state;
-	try_block_console_kthreads(10000);
 	usermodehelper_disable();
 	device_shutdown();
 }
-- 
cgit v1.2.3


From 05c96b3713aa2a406d0c4ef0505bf80a6748fedb Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 23 Jun 2022 16:51:53 +0200
Subject: Revert "printk: Block console kthreads when direct printing will be
 required"

This reverts commit c3230283e2819a69dad2cf7a63143fde8bab8b5c.

The testing of 5.19 release candidates revealed missing synchronization
between early and regular console functionality.

It would be possible to start the console kthreads later as a workaround.
But it is clear that console lock serialized console drivers between
each other. It opens a big area of possible problems that were not
considered by people involved in the development and review.

printk() is crucial for debugging kernel issues and console output is
very important part of it. The number of consoles is huge and a proper
review would take some time. As a result it need to be reverted for 5.19.

Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220623145157.21938-3-pmladek@suse.com
---
 kernel/printk/printk.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 45c6c2b0b104..ea3dd55709e7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3729,9 +3729,7 @@ static bool printer_should_wake(struct console *con, u64 seq)
 		return true;
 
 	if (con->blocked ||
-	    console_kthreads_atomically_blocked() ||
-	    system_state > SYSTEM_RUNNING ||
-	    oops_in_progress) {
+	    console_kthreads_atomically_blocked()) {
 		return false;
 	}
 
-- 
cgit v1.2.3


From 007eeab7e9f03a3108c300c03e11a6c151e430c9 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 23 Jun 2022 16:51:54 +0200
Subject: Revert "printk: remove @console_locked"

This reverts commit ab406816fca009349b89cbde885daf68a8c77e33.

The testing of 5.19 release candidates revealed missing synchronization
between early and regular console functionality.

It would be possible to start the console kthreads later as a workaround.
But it is clear that console lock serialized console drivers between
each other. It opens a big area of possible problems that were not
considered by people involved in the development and review.

printk() is crucial for debugging kernel issues and console output is
very important part of it. The number of consoles is huge and a proper
review would take some time. As a result it need to be reverted for 5.19.

Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220623145157.21938-4-pmladek@suse.com
---
 kernel/printk/printk.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea3dd55709e7..dfd1a19b95d6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -340,7 +340,15 @@ static void console_kthreads_unblock(void)
 	console_kthreads_blocked = false;
 }
 
-static int console_suspended;
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track if we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console semaphore held).
+ */
+static int console_locked, console_suspended;
 
 /*
  *	Array of consoles built from command line options (console=)
@@ -2711,6 +2719,7 @@ void console_lock(void)
 	if (console_suspended)
 		return;
 	console_kthreads_block();
+	console_locked = 1;
 	console_may_schedule = 1;
 }
 EXPORT_SYMBOL(console_lock);
@@ -2735,26 +2744,15 @@ int console_trylock(void)
 		up_console_sem();
 		return 0;
 	}
+	console_locked = 1;
 	console_may_schedule = 0;
 	return 1;
 }
 EXPORT_SYMBOL(console_trylock);
 
-/*
- * This is used to help to make sure that certain paths within the VT code are
- * running with the console lock held. It is definitely not the perfect debug
- * tool (it is not known if the VT code is the task holding the console lock),
- * but it helps tracking those weird code paths in the console code such as
- * when the console is suspended: where the console is not locked but no
- * console printing may occur.
- *
- * Note: This returns true when the console is suspended but is not locked.
- *       This is intentional because the VT code must consider that situation
- *       the same as if the console was locked.
- */
 int is_console_locked(void)
 {
-	return (console_kthreads_blocked || atomic_read(&console_kthreads_active));
+	return (console_locked || atomic_read(&console_kthreads_active));
 }
 EXPORT_SYMBOL(is_console_locked);
 
@@ -2810,6 +2808,8 @@ static inline bool console_is_usable(struct console *con)
 
 static void __console_unlock(void)
 {
+	console_locked = 0;
+
 	/*
 	 * Depending on whether console_lock() or console_trylock() was used,
 	 * appropriately allow the kthread printers to continue.
@@ -3127,6 +3127,7 @@ void console_unblank(void)
 	} else
 		console_lock();
 
+	console_locked = 1;
 	console_may_schedule = 0;
 	for_each_console(c)
 		if ((c->flags & CON_ENABLED) && c->unblank)
-- 
cgit v1.2.3


From 2d9ef940f89e0ab4fde7ba6f769d82f2a450c35a Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 23 Jun 2022 16:51:55 +0200
Subject: Revert "printk: extend console_lock for per-console locking"

This reverts commit 8e274732115f63c1d09136284431b3555bd5cc56.

The testing of 5.19 release candidates revealed missing synchronization
between early and regular console functionality.

It would be possible to start the console kthreads later as a workaround.
But it is clear that console lock serialized console drivers between
each other. It opens a big area of possible problems that were not
considered by people involved in the development and review.

printk() is crucial for debugging kernel issues and console output is
very important part of it. The number of consoles is huge and a proper
review would take some time. As a result it need to be reverted for 5.19.

Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220623145157.21938-5-pmladek@suse.com
---
 kernel/printk/printk.c | 261 +++++++++++--------------------------------------
 1 file changed, 56 insertions(+), 205 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index dfd1a19b95d6..ae489dc685a1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -223,33 +223,6 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 /* Number of registered extended console drivers. */
 static int nr_ext_console_drivers;
 
-/*
- * Used to synchronize printing kthreads against direct printing via
- * console_trylock/console_unlock.
- *
- * Values:
- * -1 = console kthreads atomically blocked (via global trylock)
- *  0 = no kthread printing, console not locked (via trylock)
- * >0 = kthread(s) actively printing
- *
- * Note: For synchronizing against direct printing via
- *       console_lock/console_unlock, see the @lock variable in
- *       struct console.
- */
-static atomic_t console_kthreads_active = ATOMIC_INIT(0);
-
-#define console_kthreads_atomic_tryblock() \
-	(atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0)
-#define console_kthreads_atomic_unblock() \
-	atomic_cmpxchg(&console_kthreads_active, -1, 0)
-#define console_kthreads_atomically_blocked() \
-	(atomic_read(&console_kthreads_active) == -1)
-
-#define console_kthread_printing_tryenter() \
-	atomic_inc_unless_negative(&console_kthreads_active)
-#define console_kthread_printing_exit() \
-	atomic_dec(&console_kthreads_active)
-
 /*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
@@ -297,49 +270,6 @@ static bool panic_in_progress(void)
 	return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
 }
 
-/*
- * Tracks whether kthread printers are all blocked. A value of true implies
- * that the console is locked via console_lock() or the console is suspended.
- * Writing to this variable requires holding @console_sem.
- */
-static bool console_kthreads_blocked;
-
-/*
- * Block all kthread printers from a schedulable context.
- *
- * Requires holding @console_sem.
- */
-static void console_kthreads_block(void)
-{
-	struct console *con;
-
-	for_each_console(con) {
-		mutex_lock(&con->lock);
-		con->blocked = true;
-		mutex_unlock(&con->lock);
-	}
-
-	console_kthreads_blocked = true;
-}
-
-/*
- * Unblock all kthread printers from a schedulable context.
- *
- * Requires holding @console_sem.
- */
-static void console_kthreads_unblock(void)
-{
-	struct console *con;
-
-	for_each_console(con) {
-		mutex_lock(&con->lock);
-		con->blocked = false;
-		mutex_unlock(&con->lock);
-	}
-
-	console_kthreads_blocked = false;
-}
-
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
@@ -2673,6 +2603,13 @@ void resume_console(void)
 	down_console_sem();
 	console_suspended = 0;
 	console_unlock();
+
+	/*
+	 * While suspended, new records may have been added to the
+	 * ringbuffer. Wake up the kthread printers to print them.
+	 */
+	wake_up_klogd();
+
 	pr_flush(1000, true);
 }
 
@@ -2691,14 +2628,9 @@ static int console_cpu_notify(unsigned int cpu)
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
-		else {
-			/*
-			 * If a new CPU comes online, the conditions for
-			 * printer_should_wake() may have changed for some
-			 * kthread printer with !CON_ANYTIME.
-			 */
-			wake_up_klogd();
-		}
+
+		/* Wake kthread printers. Some may have become usable. */
+		wake_up_klogd();
 	}
 	return 0;
 }
@@ -2718,7 +2650,6 @@ void console_lock(void)
 	down_console_sem();
 	if (console_suspended)
 		return;
-	console_kthreads_block();
 	console_locked = 1;
 	console_may_schedule = 1;
 }
@@ -2740,10 +2671,6 @@ int console_trylock(void)
 		up_console_sem();
 		return 0;
 	}
-	if (!console_kthreads_atomic_tryblock()) {
-		up_console_sem();
-		return 0;
-	}
 	console_locked = 1;
 	console_may_schedule = 0;
 	return 1;
@@ -2752,7 +2679,7 @@ EXPORT_SYMBOL(console_trylock);
 
 int is_console_locked(void)
 {
-	return (console_locked || atomic_read(&console_kthreads_active));
+	return console_locked;
 }
 EXPORT_SYMBOL(is_console_locked);
 
@@ -2796,7 +2723,7 @@ static inline bool __console_is_usable(short flags)
  * Check if the given console is currently capable and allowed to print
  * records.
  *
- * Requires holding the console_lock.
+ * Requires the console_lock.
  */
 static inline bool console_is_usable(struct console *con)
 {
@@ -2809,22 +2736,6 @@ static inline bool console_is_usable(struct console *con)
 static void __console_unlock(void)
 {
 	console_locked = 0;
-
-	/*
-	 * Depending on whether console_lock() or console_trylock() was used,
-	 * appropriately allow the kthread printers to continue.
-	 */
-	if (console_kthreads_blocked)
-		console_kthreads_unblock();
-	else
-		console_kthreads_atomic_unblock();
-
-	/*
-	 * New records may have arrived while the console was locked.
-	 * Wake the kthread printers to print them.
-	 */
-	wake_up_klogd();
-
 	up_console_sem();
 }
 
@@ -2842,19 +2753,17 @@ static void __console_unlock(void)
  *
  * @handover will be set to true if a printk waiter has taken over the
  * console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false. A NULL pointer may be provided
- * to disable allowing the console_lock to be taken over by a printk waiter.
+ * console_lock. Otherwise it is set to false.
  *
  * Returns false if the given console has no next record to print, otherwise
  * true.
  *
- * Requires the console_lock if @handover is non-NULL.
- * Requires con->lock otherwise.
+ * Requires the console_lock.
  */
-static bool __console_emit_next_record(struct console *con, char *text, char *ext_text,
-				       char *dropped_text, bool *handover)
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+				     char *dropped_text, bool *handover)
 {
-	static atomic_t panic_console_dropped = ATOMIC_INIT(0);
+	static int panic_console_dropped;
 	struct printk_info info;
 	struct printk_record r;
 	unsigned long flags;
@@ -2863,8 +2772,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
 
 	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
 
-	if (handover)
-		*handover = false;
+	*handover = false;
 
 	if (!prb_read_valid(prb, con->seq, &r))
 		return false;
@@ -2872,8 +2780,7 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
 	if (con->seq != r.info->seq) {
 		con->dropped += r.info->seq - con->seq;
 		con->seq = r.info->seq;
-		if (panic_in_progress() &&
-		    atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) {
+		if (panic_in_progress() && panic_console_dropped++ > 10) {
 			suppress_panic_printk = 1;
 			pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
 		}
@@ -2895,61 +2802,31 @@ static bool __console_emit_next_record(struct console *con, char *text, char *ex
 		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
 	}
 
-	if (handover) {
-		/*
-		 * While actively printing out messages, if another printk()
-		 * were to occur on another CPU, it may wait for this one to
-		 * finish. This task can not be preempted if there is a
-		 * waiter waiting to take over.
-		 *
-		 * Interrupts are disabled because the hand over to a waiter
-		 * must not be interrupted until the hand over is completed
-		 * (@console_waiter is cleared).
-		 */
-		printk_safe_enter_irqsave(flags);
-		console_lock_spinning_enable();
-
-		/* don't trace irqsoff print latency */
-		stop_critical_timings();
-	}
+	/*
+	 * While actively printing out messages, if another printk()
+	 * were to occur on another CPU, it may wait for this one to
+	 * finish. This task can not be preempted if there is a
+	 * waiter waiting to take over.
+	 *
+	 * Interrupts are disabled because the hand over to a waiter
+	 * must not be interrupted until the hand over is completed
+	 * (@console_waiter is cleared).
+	 */
+	printk_safe_enter_irqsave(flags);
+	console_lock_spinning_enable();
 
+	stop_critical_timings();	/* don't trace print latency */
 	call_console_driver(con, write_text, len, dropped_text);
+	start_critical_timings();
 
 	con->seq++;
 
-	if (handover) {
-		start_critical_timings();
-		*handover = console_lock_spinning_disable_and_check();
-		printk_safe_exit_irqrestore(flags);
-	}
+	*handover = console_lock_spinning_disable_and_check();
+	printk_safe_exit_irqrestore(flags);
 skip:
 	return true;
 }
 
-/*
- * Print a record for a given console, but allow another printk() caller to
- * take over the console_lock and continue printing.
- *
- * Requires the console_lock, but depending on @handover after the call, the
- * caller may no longer have the console_lock.
- *
- * See __console_emit_next_record() for argument and return details.
- */
-static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text,
-						  char *dropped_text, bool *handover)
-{
-	/*
-	 * Handovers are only supported if threaded printers are atomically
-	 * blocked. The context taking over the console_lock may be atomic.
-	 */
-	if (!console_kthreads_atomically_blocked()) {
-		*handover = false;
-		handover = NULL;
-	}
-
-	return __console_emit_next_record(con, text, ext_text, dropped_text, handover);
-}
-
 /*
  * Print out all remaining records to all consoles.
  *
@@ -3001,11 +2878,13 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 
 			if (con->flags & CON_EXTENDED) {
 				/* Extended consoles do not print "dropped messages". */
-				progress = console_emit_next_record_transferable(con, &text[0],
-								&ext_text[0], NULL, handover);
+				progress = console_emit_next_record(con, &text[0],
+								    &ext_text[0], NULL,
+								    handover);
 			} else {
-				progress = console_emit_next_record_transferable(con, &text[0],
-								NULL, &dropped_text[0], handover);
+				progress = console_emit_next_record(con, &text[0],
+								    NULL, &dropped_text[0],
+								    handover);
 			}
 			if (*handover)
 				return false;
@@ -3120,10 +2999,6 @@ void console_unblank(void)
 	if (oops_in_progress) {
 		if (down_trylock_console_sem() != 0)
 			return;
-		if (!console_kthreads_atomic_tryblock()) {
-			up_console_sem();
-			return;
-		}
 	} else
 		console_lock();
 
@@ -3206,6 +3081,10 @@ void console_start(struct console *console)
 	console_lock();
 	console->flags |= CON_ENABLED;
 	console_unlock();
+
+	/* Wake the newly enabled kthread printer. */
+	wake_up_klogd();
+
 	__pr_flush(console, 1000, true);
 }
 EXPORT_SYMBOL(console_start);
@@ -3407,8 +3286,6 @@ void register_console(struct console *newcon)
 
 	newcon->dropped = 0;
 	newcon->thread = NULL;
-	newcon->blocked = true;
-	mutex_init(&newcon->lock);
 
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/* Get a consistent copy of @syslog_seq. */
@@ -3709,19 +3586,6 @@ static void printk_fallback_preferred_direct(void)
 	console_unlock();
 }
 
-/*
- * Print a record for a given console, not allowing another printk() caller
- * to take over. This is appropriate for contexts that do not have the
- * console_lock.
- *
- * See __console_emit_next_record() for argument and return details.
- */
-static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
-				     char *dropped_text)
-{
-	return __console_emit_next_record(con, text, ext_text, dropped_text, NULL);
-}
-
 static bool printer_should_wake(struct console *con, u64 seq)
 {
 	short flags;
@@ -3729,10 +3593,8 @@ static bool printer_should_wake(struct console *con, u64 seq)
 	if (kthread_should_stop() || !printk_kthreads_available)
 		return true;
 
-	if (con->blocked ||
-	    console_kthreads_atomically_blocked()) {
+	if (console_suspended)
 		return false;
-	}
 
 	/*
 	 * This is an unsafe read from con->flags, but a false positive is
@@ -3753,6 +3615,7 @@ static int printk_kthread_func(void *data)
 	struct console *con = data;
 	char *dropped_text = NULL;
 	char *ext_text = NULL;
+	bool handover;
 	u64 seq = 0;
 	char *text;
 	int error;
@@ -3802,27 +3665,15 @@ static int printk_kthread_func(void *data)
 		if (error)
 			continue;
 
-		error = mutex_lock_interruptible(&con->lock);
-		if (error)
-			continue;
+		console_lock();
 
-		if (con->blocked ||
-		    !console_kthread_printing_tryenter()) {
-			/* Another context has locked the console_lock. */
-			mutex_unlock(&con->lock);
+		if (console_suspended) {
+			up_console_sem();
 			continue;
 		}
 
-		/*
-		 * Although this context has not locked the console_lock, it
-		 * is known that the console_lock is not locked and it is not
-		 * possible for any other context to lock the console_lock.
-		 * Therefore it is safe to read con->flags.
-		 */
-
-		if (!__console_is_usable(con->flags)) {
-			console_kthread_printing_exit();
-			mutex_unlock(&con->lock);
+		if (!console_is_usable(con)) {
+			__console_unlock();
 			continue;
 		}
 
@@ -3835,13 +3686,13 @@ static int printk_kthread_func(void *data)
 		 * which can conditionally invoke cond_resched().
 		 */
 		console_may_schedule = 0;
-		console_emit_next_record(con, text, ext_text, dropped_text);
+		console_emit_next_record(con, text, ext_text, dropped_text, &handover);
+		if (handover)
+			continue;
 
 		seq = con->seq;
 
-		console_kthread_printing_exit();
-
-		mutex_unlock(&con->lock);
+		__console_unlock();
 	}
 
 	con_printk(KERN_INFO, con, "printing thread stopped\n");
-- 
cgit v1.2.3


From 5831788afb17b89c5b531fb60cbd798613ccbb63 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 23 Jun 2022 16:51:56 +0200
Subject: Revert "printk: add kthread console printers"

This reverts commit 09c5ba0aa2fcfdadb17d045c3ee6f86d69270df7.

This reverts commit b87f02307d3cfbda768520f0687c51ca77e14fc3.

The testing of 5.19 release candidates revealed missing synchronization
between early and regular console functionality.

It would be possible to start the console kthreads later as a workaround.
But it is clear that console lock serialized console drivers between
each other. It opens a big area of possible problems that were not
considered by people involved in the development and review.

printk() is crucial for debugging kernel issues and console output is
very important part of it. The number of consoles is huge and a proper
review would take some time. As a result it need to be reverted for 5.19.

Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220623145157.21938-6-pmladek@suse.com
---
 kernel/printk/printk.c | 329 ++++---------------------------------------------
 1 file changed, 22 insertions(+), 307 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ae489dc685a1..5a6273e56b4e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -361,13 +361,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 /* syslog_lock protects syslog_* variables and write access to clear_seq. */
 static DEFINE_MUTEX(syslog_lock);
 
-/*
- * A flag to signify if printk_activate_kthreads() has already started the
- * kthread printers. If true, any later registered consoles must start their
- * own kthread directly. The flag is write protected by the console_lock.
- */
-static bool printk_kthreads_available;
-
 #ifdef CONFIG_PRINTK
 static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
 
@@ -397,39 +390,6 @@ void printk_prefer_direct_exit(void)
 	WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
 }
 
-/*
- * Calling printk() always wakes kthread printers so that they can
- * flush the new message to their respective consoles. Also, if direct
- * printing is allowed, printk() tries to flush the messages directly.
- *
- * Direct printing is allowed in situations when the kthreads
- * are not available or the system is in a problematic state.
- *
- * See the implementation about possible races.
- */
-static inline bool allow_direct_printing(void)
-{
-	/*
-	 * Checking kthread availability is a possible race because the
-	 * kthread printers can become permanently disabled during runtime.
-	 * However, doing that requires holding the console_lock, so any
-	 * pending messages will be direct printed by console_unlock().
-	 */
-	if (!printk_kthreads_available)
-		return true;
-
-	/*
-	 * Prefer direct printing when the system is in a problematic state.
-	 * The context that sets this state will always see the updated value.
-	 * The other contexts do not care. Anyway, direct printing is just a
-	 * best effort. The direct output is only possible when console_lock
-	 * is not already taken and no kthread printers are actively printing.
-	 */
-	return (system_state > SYSTEM_RUNNING ||
-		oops_in_progress ||
-		atomic_read(&printk_prefer_direct));
-}
-
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -2320,10 +2280,10 @@ asmlinkage int vprintk_emit(int facility, int level,
 	printed_len = vprintk_store(facility, level, dev_info, fmt, args);
 
 	/* If called from the scheduler, we can not call up(). */
-	if (!in_sched && allow_direct_printing()) {
+	if (!in_sched) {
 		/*
 		 * The caller may be holding system-critical or
-		 * timing-sensitive locks. Disable preemption during direct
+		 * timing-sensitive locks. Disable preemption during
 		 * printing of all remaining records to all consoles so that
 		 * this context can return as soon as possible. Hopefully
 		 * another printk() caller will take over the printing.
@@ -2366,8 +2326,6 @@ EXPORT_SYMBOL(_printk);
 
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
 
-static void printk_start_kthread(struct console *con);
-
 #else /* CONFIG_PRINTK */
 
 #define CONSOLE_LOG_MAX		0
@@ -2401,8 +2359,6 @@ static void call_console_driver(struct console *con, const char *text, size_t le
 }
 static bool suppress_message_printing(int level) { return false; }
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
-static void printk_start_kthread(struct console *con) { }
-static bool allow_direct_printing(void) { return true; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -2603,13 +2559,6 @@ void resume_console(void)
 	down_console_sem();
 	console_suspended = 0;
 	console_unlock();
-
-	/*
-	 * While suspended, new records may have been added to the
-	 * ringbuffer. Wake up the kthread printers to print them.
-	 */
-	wake_up_klogd();
-
 	pr_flush(1000, true);
 }
 
@@ -2628,9 +2577,6 @@ static int console_cpu_notify(unsigned int cpu)
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
-
-		/* Wake kthread printers. Some may have become usable. */
-		wake_up_klogd();
 	}
 	return 0;
 }
@@ -2702,9 +2648,18 @@ static bool abandon_console_lock_in_panic(void)
 	return atomic_read(&panic_cpu) != raw_smp_processor_id();
 }
 
-static inline bool __console_is_usable(short flags)
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records.
+ *
+ * Requires the console_lock.
+ */
+static inline bool console_is_usable(struct console *con)
 {
-	if (!(flags & CON_ENABLED))
+	if (!(con->flags & CON_ENABLED))
+		return false;
+
+	if (!con->write)
 		return false;
 
 	/*
@@ -2713,26 +2668,12 @@ static inline bool __console_is_usable(short flags)
 	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
 	 */
 	if (!cpu_online(raw_smp_processor_id()) &&
-	    !(flags & CON_ANYTIME))
+	    !(con->flags & CON_ANYTIME))
 		return false;
 
 	return true;
 }
 
-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires the console_lock.
- */
-static inline bool console_is_usable(struct console *con)
-{
-	if (!con->write)
-		return false;
-
-	return __console_is_usable(con->flags);
-}
-
 static void __console_unlock(void)
 {
 	console_locked = 0;
@@ -2845,8 +2786,8 @@ skip:
  * were flushed to all usable consoles. A returned false informs the caller
  * that everything was not flushed (either there were no usable consoles or
  * another context has taken over printing or it is a panic situation and this
- * is not the panic CPU or direct printing is not preferred). Regardless the
- * reason, the caller should assume it is not useful to immediately try again.
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
  *
  * Requires the console_lock.
  */
@@ -2863,10 +2804,6 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 	*handover = false;
 
 	do {
-		/* Let the kthread printers do the work if they can. */
-		if (!allow_direct_printing())
-			return false;
-
 		any_progress = false;
 
 		for_each_console(con) {
@@ -3081,10 +3018,6 @@ void console_start(struct console *console)
 	console_lock();
 	console->flags |= CON_ENABLED;
 	console_unlock();
-
-	/* Wake the newly enabled kthread printer. */
-	wake_up_klogd();
-
 	__pr_flush(console, 1000, true);
 }
 EXPORT_SYMBOL(console_start);
@@ -3285,8 +3218,6 @@ void register_console(struct console *newcon)
 		nr_ext_console_drivers++;
 
 	newcon->dropped = 0;
-	newcon->thread = NULL;
-
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/* Get a consistent copy of @syslog_seq. */
 		mutex_lock(&syslog_lock);
@@ -3296,10 +3227,6 @@ void register_console(struct console *newcon)
 		/* Begin with next message. */
 		newcon->seq = prb_next_seq(prb);
 	}
-
-	if (printk_kthreads_available)
-		printk_start_kthread(newcon);
-
 	console_unlock();
 	console_sysfs_notify();
 
@@ -3326,7 +3253,6 @@ EXPORT_SYMBOL(register_console);
 
 int unregister_console(struct console *console)
 {
-	struct task_struct *thd;
 	struct console *con;
 	int res;
 
@@ -3367,20 +3293,7 @@ int unregister_console(struct console *console)
 		console_drivers->flags |= CON_CONSDEV;
 
 	console->flags &= ~CON_ENABLED;
-
-	/*
-	 * console->thread can only be cleared under the console lock. But
-	 * stopping the thread must be done without the console lock. The
-	 * task that clears @thread is the task that stops the kthread.
-	 */
-	thd = console->thread;
-	console->thread = NULL;
-
 	console_unlock();
-
-	if (thd)
-		kthread_stop(thd);
-
 	console_sysfs_notify();
 
 	if (console->exit)
@@ -3476,20 +3389,6 @@ static int __init printk_late_init(void)
 }
 late_initcall(printk_late_init);
 
-static int __init printk_activate_kthreads(void)
-{
-	struct console *con;
-
-	console_lock();
-	printk_kthreads_available = true;
-	for_each_console(con)
-		printk_start_kthread(con);
-	console_unlock();
-
-	return 0;
-}
-early_initcall(printk_activate_kthreads);
-
 #if defined CONFIG_PRINTK
 /* If @con is specified, only wait for that console. Otherwise wait for all. */
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
@@ -3564,180 +3463,11 @@ bool pr_flush(int timeout_ms, bool reset_on_progress)
 }
 EXPORT_SYMBOL(pr_flush);
 
-static void __printk_fallback_preferred_direct(void)
-{
-	printk_prefer_direct_enter();
-	pr_err("falling back to preferred direct printing\n");
-	printk_kthreads_available = false;
-}
-
-/*
- * Enter preferred direct printing, but never exit. Mark console threads as
- * unavailable. The system is then forever in preferred direct printing and
- * any printing threads will exit.
- *
- * Must *not* be called under console_lock. Use
- * __printk_fallback_preferred_direct() if already holding console_lock.
- */
-static void printk_fallback_preferred_direct(void)
-{
-	console_lock();
-	__printk_fallback_preferred_direct();
-	console_unlock();
-}
-
-static bool printer_should_wake(struct console *con, u64 seq)
-{
-	short flags;
-
-	if (kthread_should_stop() || !printk_kthreads_available)
-		return true;
-
-	if (console_suspended)
-		return false;
-
-	/*
-	 * This is an unsafe read from con->flags, but a false positive is
-	 * not a problem. Worst case it would allow the printer to wake up
-	 * although it is disabled. But the printer will notice that when
-	 * attempting to print and instead go back to sleep.
-	 */
-	flags = data_race(READ_ONCE(con->flags));
-
-	if (!__console_is_usable(flags))
-		return false;
-
-	return prb_read_valid(prb, seq, NULL);
-}
-
-static int printk_kthread_func(void *data)
-{
-	struct console *con = data;
-	char *dropped_text = NULL;
-	char *ext_text = NULL;
-	bool handover;
-	u64 seq = 0;
-	char *text;
-	int error;
-
-	text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
-	if (!text) {
-		con_printk(KERN_ERR, con, "failed to allocate text buffer\n");
-		printk_fallback_preferred_direct();
-		goto out;
-	}
-
-	if (con->flags & CON_EXTENDED) {
-		ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
-		if (!ext_text) {
-			con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n");
-			printk_fallback_preferred_direct();
-			goto out;
-		}
-	} else {
-		dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
-		if (!dropped_text) {
-			con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n");
-			printk_fallback_preferred_direct();
-			goto out;
-		}
-	}
-
-	con_printk(KERN_INFO, con, "printing thread started\n");
-
-	for (;;) {
-		/*
-		 * Guarantee this task is visible on the waitqueue before
-		 * checking the wake condition.
-		 *
-		 * The full memory barrier within set_current_state() of
-		 * prepare_to_wait_event() pairs with the full memory barrier
-		 * within wq_has_sleeper().
-		 *
-		 * This pairs with __wake_up_klogd:A.
-		 */
-		error = wait_event_interruptible(log_wait,
-				printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */
-
-		if (kthread_should_stop() || !printk_kthreads_available)
-			break;
-
-		if (error)
-			continue;
-
-		console_lock();
-
-		if (console_suspended) {
-			up_console_sem();
-			continue;
-		}
-
-		if (!console_is_usable(con)) {
-			__console_unlock();
-			continue;
-		}
-
-		/*
-		 * Even though the printk kthread is always preemptible, it is
-		 * still not allowed to call cond_resched() from within
-		 * console drivers. The task may become non-preemptible in the
-		 * console driver call chain. For example, vt_console_print()
-		 * takes a spinlock and then can call into fbcon_redraw(),
-		 * which can conditionally invoke cond_resched().
-		 */
-		console_may_schedule = 0;
-		console_emit_next_record(con, text, ext_text, dropped_text, &handover);
-		if (handover)
-			continue;
-
-		seq = con->seq;
-
-		__console_unlock();
-	}
-
-	con_printk(KERN_INFO, con, "printing thread stopped\n");
-out:
-	kfree(dropped_text);
-	kfree(ext_text);
-	kfree(text);
-
-	console_lock();
-	/*
-	 * If this kthread is being stopped by another task, con->thread will
-	 * already be NULL. That is fine. The important thing is that it is
-	 * NULL after the kthread exits.
-	 */
-	con->thread = NULL;
-	console_unlock();
-
-	return 0;
-}
-
-/* Must be called under console_lock. */
-static void printk_start_kthread(struct console *con)
-{
-	/*
-	 * Do not start a kthread if there is no write() callback. The
-	 * kthreads assume the write() callback exists.
-	 */
-	if (!con->write)
-		return;
-
-	con->thread = kthread_run(printk_kthread_func, con,
-				  "pr/%s%d", con->name, con->index);
-	if (IS_ERR(con->thread)) {
-		con->thread = NULL;
-		con_printk(KERN_ERR, con, "unable to start printing thread\n");
-		__printk_fallback_preferred_direct();
-		return;
-	}
-}
-
 /*
  * Delayed printk version, for scheduler-internal messages:
  */
-#define PRINTK_PENDING_WAKEUP		0x01
-#define PRINTK_PENDING_DIRECT_OUTPUT	0x02
+#define PRINTK_PENDING_WAKEUP	0x01
+#define PRINTK_PENDING_OUTPUT	0x02
 
 static DEFINE_PER_CPU(int, printk_pending);
 
@@ -3745,14 +3475,10 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
 	int pending = this_cpu_xchg(printk_pending, 0);
 
-	if (pending & PRINTK_PENDING_DIRECT_OUTPUT) {
-		printk_prefer_direct_enter();
-
+	if (pending & PRINTK_PENDING_OUTPUT) {
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
-
-		printk_prefer_direct_exit();
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
@@ -3777,11 +3503,10 @@ static void __wake_up_klogd(int val)
 	 * prepare_to_wait_event(), which is called after ___wait_event() adds
 	 * the waiter but before it has checked the wait condition.
 	 *
-	 * This pairs with devkmsg_read:A, syslog_print:A, and
-	 * printk_kthread_func:A.
+	 * This pairs with devkmsg_read:A and syslog_print:A.
 	 */
 	if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
-	    (val & PRINTK_PENDING_DIRECT_OUTPUT)) {
+	    (val & PRINTK_PENDING_OUTPUT)) {
 		this_cpu_or(printk_pending, val);
 		irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	}
@@ -3799,17 +3524,7 @@ void defer_console_output(void)
 	 * New messages may have been added directly to the ringbuffer
 	 * using vprintk_store(), so wake any waiters as well.
 	 */
-	int val = PRINTK_PENDING_WAKEUP;
-
-	/*
-	 * Make sure that some context will print the messages when direct
-	 * printing is allowed. This happens in situations when the kthreads
-	 * may not be as reliable or perhaps unusable.
-	 */
-	if (allow_direct_printing())
-		val |= PRINTK_PENDING_DIRECT_OUTPUT;
-
-	__wake_up_klogd(val);
+	__wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
 }
 
 void printk_trigger_flush(void)
-- 
cgit v1.2.3


From 07a22b61946f0b80065b0ddcc703b715f84355f5 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 23 Jun 2022 16:51:57 +0200
Subject: Revert "printk: add functions to prefer direct printing"

This reverts commit 2bb2b7b57f81255c13f4395ea911d6bdc70c9fe2.

The testing of 5.19 release candidates revealed missing synchronization
between early and regular console functionality.

It would be possible to start the console kthreads later as a workaround.
But it is clear that console lock serialized console drivers between
each other. It opens a big area of possible problems that were not
considered by people involved in the development and review.

printk() is crucial for debugging kernel issues and console output is
very important part of it. The number of consoles is huge and a proper
review would take some time. As a result it need to be reverted for 5.19.

Link: https://lore.kernel.org/r/YrBdjVwBOVgLfHyb@alley
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220623145157.21938-7-pmladek@suse.com
---
 kernel/hung_task.c      | 11 +----------
 kernel/panic.c          |  4 ----
 kernel/printk/printk.c  | 28 ----------------------------
 kernel/rcu/tree_stall.h |  2 --
 kernel/reboot.c         | 14 +-------------
 kernel/watchdog.c       |  4 ----
 kernel/watchdog_hld.c   |  4 ----
 7 files changed, 2 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 02a65d554340..52501e5f7655 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,8 +127,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	 * complain:
 	 */
 	if (sysctl_hung_task_warnings) {
-		printk_prefer_direct_enter();
-
 		if (sysctl_hung_task_warnings > 0)
 			sysctl_hung_task_warnings--;
 		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -144,8 +142,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 
 		if (sysctl_hung_task_all_cpu_backtrace)
 			hung_task_show_all_bt = true;
-
-		printk_prefer_direct_exit();
 	}
 
 	touch_nmi_watchdog();
@@ -208,17 +204,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	}
  unlock:
 	rcu_read_unlock();
-	if (hung_task_show_lock) {
-		printk_prefer_direct_enter();
+	if (hung_task_show_lock)
 		debug_show_all_locks();
-		printk_prefer_direct_exit();
-	}
 
 	if (hung_task_show_all_bt) {
 		hung_task_show_all_bt = false;
-		printk_prefer_direct_enter();
 		trigger_all_cpu_backtrace();
-		printk_prefer_direct_exit();
 	}
 
 	if (hung_task_call_panic)
diff --git a/kernel/panic.c b/kernel/panic.c
index 6737b2332275..8355b19676f8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -579,8 +579,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
-	printk_prefer_direct_enter();
-
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
 			raw_smp_processor_id(), current->pid, file, line,
@@ -610,8 +608,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 
 	/* Just a warning, don't kill lockdep. */
 	add_taint(taint, LOCKDEP_STILL_OK);
-
-	printk_prefer_direct_exit();
 }
 
 #ifndef __WARN_FLAGS
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5a6273e56b4e..b49c6ff6dca0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -362,34 +362,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 static DEFINE_MUTEX(syslog_lock);
 
 #ifdef CONFIG_PRINTK
-static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
-
-/**
- * printk_prefer_direct_enter - cause printk() calls to attempt direct
- *                              printing to all enabled consoles
- *
- * Since it is not possible to call into the console printing code from any
- * context, there is no guarantee that direct printing will occur.
- *
- * This globally effects all printk() callers.
- *
- * Context: Any context.
- */
-void printk_prefer_direct_enter(void)
-{
-	atomic_inc(&printk_prefer_direct);
-}
-
-/**
- * printk_prefer_direct_exit - restore printk() behavior
- *
- * Context: Any context.
- */
-void printk_prefer_direct_exit(void)
-{
-	WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
-}
-
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 4995c078cff9..a001e1e7a992 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -647,7 +647,6 @@ static void print_cpu_stall(unsigned long gps)
 	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
-	printk_prefer_direct_enter();
 	trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
 	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
 	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
@@ -685,7 +684,6 @@ static void print_cpu_stall(unsigned long gps)
 	 */
 	set_tsk_need_resched(current);
 	set_preempt_need_resched();
-	printk_prefer_direct_exit();
 }
 
 static void check_cpu_stall(struct rcu_data *rdp)
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 4177645e74d6..6bcc5d6a6572 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -447,11 +447,9 @@ static int __orderly_reboot(void)
 	ret = run_cmd(reboot_cmd);
 
 	if (ret) {
-		printk_prefer_direct_enter();
 		pr_warn("Failed to start orderly reboot: forcing the issue\n");
 		emergency_sync();
 		kernel_restart(NULL);
-		printk_prefer_direct_exit();
 	}
 
 	return ret;
@@ -464,7 +462,6 @@ static int __orderly_poweroff(bool force)
 	ret = run_cmd(poweroff_cmd);
 
 	if (ret && force) {
-		printk_prefer_direct_enter();
 		pr_warn("Failed to start orderly shutdown: forcing the issue\n");
 
 		/*
@@ -474,7 +471,6 @@ static int __orderly_poweroff(bool force)
 		 */
 		emergency_sync();
 		kernel_power_off();
-		printk_prefer_direct_exit();
 	}
 
 	return ret;
@@ -532,8 +528,6 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
  */
 static void hw_failure_emergency_poweroff_func(struct work_struct *work)
 {
-	printk_prefer_direct_enter();
-
 	/*
 	 * We have reached here after the emergency shutdown waiting period has
 	 * expired. This means orderly_poweroff has not been able to shut off
@@ -550,8 +544,6 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work)
 	 */
 	pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
 	emergency_restart();
-
-	printk_prefer_direct_exit();
 }
 
 static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
@@ -590,13 +582,11 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 {
 	static atomic_t allow_proceed = ATOMIC_INIT(1);
 
-	printk_prefer_direct_enter();
-
 	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);
 
 	/* Shutdown should be initiated only once. */
 	if (!atomic_dec_and_test(&allow_proceed))
-		goto out;
+		return;
 
 	/*
 	 * Queue a backup emergency shutdown in the event of
@@ -604,8 +594,6 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 	 */
 	hw_failure_emergency_poweroff(ms_until_forced);
 	orderly_poweroff(true);
-out:
-	printk_prefer_direct_exit();
 }
 EXPORT_SYMBOL_GPL(hw_protection_shutdown);
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 40024e03d422..9166220457bc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -424,8 +424,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		/* Start period for the next softlockup warning. */
 		update_report_ts();
 
-		printk_prefer_direct_enter();
-
 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -444,8 +442,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
 		if (softlockup_panic)
 			panic("softlockup: hung tasks");
-
-		printk_prefer_direct_exit();
 	}
 
 	return HRTIMER_RESTART;
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 701f35f0e2d4..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,8 +135,6 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		if (__this_cpu_read(hard_watchdog_warn) == true)
 			return;
 
-		printk_prefer_direct_enter();
-
 		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
 			 this_cpu);
 		print_modules();
@@ -157,8 +155,6 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
 
-		printk_prefer_direct_exit();
-
 		__this_cpu_write(hard_watchdog_warn, true);
 		return;
 	}
-- 
cgit v1.2.3


From 2390095113e98fc52fffe35c5206d30d9efe3f78 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Mon, 27 Jun 2022 12:22:09 +0900
Subject: tick/nohz: unexport __init-annotated tick_nohz_full_setup()

EXPORT_SYMBOL and __init is a bad combination because the .init.text
section is freed up after the initialization. Hence, modules cannot
use symbols annotated __init. The access to a freed symbol may end up
with kernel panic.

modpost used to detect it, but it had been broken for a decade.

Commit 28438794aba4 ("modpost: fix section mismatch check for exported
init/exit sections") fixed it so modpost started to warn it again, then
this showed up:

    MODPOST vmlinux.symvers
  WARNING: modpost: vmlinux.o(___ksymtab_gpl+tick_nohz_full_setup+0x0): Section mismatch in reference from the variable __ksymtab_tick_nohz_full_setup to the function .init.text:tick_nohz_full_setup()
  The symbol tick_nohz_full_setup is exported and annotated __init
  Fix this by removing the __init annotation of tick_nohz_full_setup or drop the export.

Drop the export because tick_nohz_full_setup() is only called from the
built-in code in kernel/sched/isolation.c.

Fixes: ae9e557b5be2 ("time: Export tick start/stop functions for rcutorture")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/tick-sched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 58a11f859ac7..30049580cd62 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -526,7 +526,6 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 	cpumask_copy(tick_nohz_full_mask, cpumask);
 	tick_nohz_full_running = true;
 }
-EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
 
 static int tick_nohz_cpu_down(unsigned int cpu)
 {
-- 
cgit v1.2.3