From 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 Mon Sep 17 00:00:00 2001 From: Jules Maselbas Date: Thu, 29 Mar 2018 15:43:01 +0100 Subject: sched/cpufreq/schedutil: Fix error path mutex unlock This patch prevents the 'global_tunables_lock' mutex from being unlocked before being locked. This mutex is not locked if the sugov_kthread_create() function fails. Signed-off-by: Jules Maselbas Acked-by: Peter Zijlstra Cc: Chris Redpath Cc: Dietmar Eggermann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Patrick Bellasi Cc: Stephen Kyle Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: nd@arm.com Link: http://lkml.kernel.org/r/20180329144301.38419-1-jules.maselbas@arm.com Signed-off-by: Ingo Molnar
--- kernel/sched/cpufreq_schedutil.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7936f548e071..617c6741c525 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -625,10 +625,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch:
-- cgit v1.2.3
From d29a20645d5e929aa7e8616f28e5d8e1c49263ec Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 2 Apr 2018 09:49:54 -0700 Subject: sched/rt: Fix rq->clock_update_flags < RQCF_ACT_SKIP warning While running rt-tests' pi_stress program I got the following splat: rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 27 PID: 0 at kernel/sched/sched.h:960 assert_clock_updated.isra.38.part.39+0x13/0x20 [...] enqueue_top_rt_rq+0xf4/0x150 ? cpufreq_dbs_governor_start+0x170/0x170 sched_rt_rq_enqueue+0x65/0x80 sched_rt_period_timer+0x156/0x360 ? sched_rt_rq_enqueue+0x80/0x80 __hrtimer_run_queues+0xfa/0x260 hrtimer_interrupt+0xcb/0x220 smp_apic_timer_interrupt+0x62/0x120 apic_timer_interrupt+0xf/0x20 [...] do_idle+0x183/0x1e0 cpu_startup_entry+0x5f/0x70 start_secondary+0x192/0x1d0 secondary_startup_64+0xa5/0xb0 We can get rid of it by the "traditional" means of adding an update_rq_clock() call after acquiring the rq->lock in do_sched_rt_period_timer(). The case for the RT task throttling (which this workload also hits) can be ignored in that the skip_update call is actually bogus and quite the contrary (the request bits are removed/reverted). By setting RQCF_UPDATED we really don't care if the skip is happening or not and will therefore make the assert_clock_updated() check happy.
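The warning above is an ordering assertion over a small set of flag bits. A minimal userspace model (the flag values and the comparison are taken from kernel/sched/sched.h; the rest is illustrative, not the kernel code) shows why calling update_rq_clock() right after taking rq->lock silences the splat:

#include <assert.h>
#include <stdio.h>

#define RQCF_REQ_SKIP  0x01    /* flag values as in kernel/sched/sched.h */
#define RQCF_ACT_SKIP  0x02
#define RQCF_UPDATED   0x04

struct rq { unsigned int clock_update_flags; unsigned long clock; };

static void update_rq_clock(struct rq *rq)
{
        rq->clock++;                            /* stand-in for reading sched_clock() */
        rq->clock_update_flags |= RQCF_UPDATED;
}

static unsigned long rq_clock(struct rq *rq)
{
        /* the WARN_ON the splat came from: the clock was neither
         * updated nor explicitly skipped since the lock was taken */
        assert(rq->clock_update_flags >= RQCF_ACT_SKIP);
        return rq->clock;
}

int main(void)
{
        struct rq rq = { 0, 0 };  /* raw_spin_lock() would clear the flags */

        update_rq_clock(&rq);     /* the added call; without it the assert fires */
        printf("clock=%lu\n", rq_clock(&rq));
        return 0;
}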
Signed-off-by: Davidlohr Bueso Reviewed-by: Matt Fleming Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: linux-kernel@vger.kernel.org Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180402164954.16255-1-dave@stgolabs.net Signed-off-by: Ingo Molnar
--- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel')
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 86b77987435e..ad13e6242481 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) continue; raw_spin_lock(&rq->lock); + update_rq_clock(rq); + if (rt_rq->rt_time) { u64 runtime;
-- cgit v1.2.3
From adcc8da8859bee9548bb6d323b1e8de8a7252acd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Apr 2018 09:15:39 -0700 Subject: sched/core: Simplify helpers for rq clock update skip requests By renaming the functions we can get rid of the skip parameter and have better code readability. It makes zero sense to have things such as: rq_clock_skip_update(rq, false) When the skip request is in fact not going to happen. Ever. Rename things such that we end up with: rq_clock_skip_update(rq) rq_clock_cancel_skipupdate(rq) Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: matt@codeblueprint.co.uk Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180404161539.nhadkff2aats74jh@linux-n805 Signed-off-by: Ingo Molnar
--- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 17 ++++++++++++----- 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel')
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b68995a417..550a07f648b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -874,7 +874,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d1c7bf7c7e5b..e7b3008b85bb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0951d1c58d2f..54dc31e7ab9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } set_skip_buddy(se);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad13e6242481..7aef6b4e885a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -861,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * 'runtime'.
*/ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq_clock_skip_update(rq, false); + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c3deaee7a7a2..15750c222ca2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -static inline void rq_clock_skip_update(struct rq *rq, bool skip) +static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); - if (skip) - rq->clock_update_flags |= RQCF_REQ_SKIP; - else - rq->clock_update_flags &= ~RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags {
-- cgit v1.2.3
From 7303e30ec1d8fb5ca1f07c92d069241c32b2ee1b Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:03 +0200 Subject: syscalls/core: Prepare CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y for compat syscalls It may be useful for an architecture to override the definitions of the COMPAT_SYSCALL_DEFINE0() and __COMPAT_SYSCALL_DEFINEx() macros in <linux/compat.h>, in particular to use a different calling convention for syscalls. This patch provides a mechanism to do so, based on the previously introduced CONFIG_ARCH_HAS_SYSCALL_WRAPPER. If it is enabled, <asm/syscall_wrapper.h> is included in <linux/compat.h> and may be used to define the macros mentioned above. Moreover, as the syscall calling convention may be different if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is set, the compat syscall function prototypes in <linux/compat.h> are #ifndef'd out in that case. As some of the syscalls and/or compat syscalls may not be present, the COND_SYSCALL() and COND_SYSCALL_COMPAT() macros in kernel/sys_ni.c as well as the SYS_NI() and COMPAT_SYS_NI() macros in kernel/time/posix-stubs.c can be re-defined in <asm/syscall_wrapper.h> iff CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-5-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar
--- kernel/sys_ni.c | 10 ++++++++++ kernel/time/posix-stubs.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel')
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6cafc008f6db..9791364925dc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -5,6 +5,11 @@ #include <asm/unistd.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* we can't #include <linux/syscalls.h> here, but tell gcc to not warn with -Wmissing-prototypes */ asmlinkage long sys_ni_syscall(void); @@ -17,8 +22,13 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } +#ifndef COND_SYSCALL #define COND_SYSCALL(name) cond_syscall(sys_##name) +#endif /* COND_SYSCALL */ + +#ifndef COND_SYSCALL_COMPAT #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) +#endif /* COND_SYSCALL_COMPAT */ /* * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b258bee13b02..69a937c3cd81 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -19,6 +19,11 @@ #include <linux/posix-timers.h> #include <linux/compat.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override SYS_NI and COMPAT_SYS_NI */ +#include <asm/syscall_wrapper.h> +#endif + asmlinkage long sys_ni_posix_timers(void) { pr_err_once("process %d (%s) attempted a POSIX timer syscall " @@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void) return -ENOSYS; } +#ifndef SYS_NI #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#endif + +#ifndef COMPAT_SYS_NI #define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) +#endif SYS_NI(timer_create); SYS_NI(timer_gettime);
-- cgit v1.2.3
From 0211e12dd0a5385ecffd3557bc570dbad7fcf245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Apr 2018 12:40:07 +0200 Subject: genirq/affinity: Don't return with empty affinity masks on error When the allocation of node_to_possible_cpumask fails, then irq_create_affinity_masks() returns with a pointer to the empty affinity masks array, which will cause malfunction. Reorder the allocations so the masks array allocation comes last and every failure path returns NULL. Fixes: 9a0ef98e186d ("genirq/affinity: Assign vectors to all present CPUs") Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Ming Lei
--- kernel/irq/affinity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel')
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a37a3b4b6342..e0665549af59 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -108,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks; + struct cpumask *masks = NULL; cpumask_var_t nmsk, *node_to_possible_cpumask; /* @@ -121,13 +121,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto out; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); if (!node_to_possible_cpumask) - goto out; + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) @@ -192,8 +192,9 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); +outnodemsk: free_node_to_possible_cpumask(node_to_possible_cpumask); -out: +outcpumsk: free_cpumask_var(nmsk); return masks; }
-- cgit v1.2.3
From 47778f33dcba7feb92031643b37e477892f82b62 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:55 +0800 Subject: genirq/affinity: Rename *node_to_possible_cpumask as *node_to_cpumask The following patches will introduce two stage irq spreading for improving irq spread on all possible CPUs. No functional change.
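The reordering in irq_create_affinity_masks() above follows a general pattern: allocate the object you intend to return last, so every failure label frees only helpers and the function can return NULL unconditionally. A hedged userspace sketch of the same goto chain (calloc stands in for the kernel allocators; all names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct cpumask { unsigned long bits; };

static struct cpumask *create_masks(int nvecs)
{
        struct cpumask *masks = NULL;
        void *nmsk, *node_map;

        nmsk = calloc(1, sizeof(long));
        if (!nmsk)
                return NULL;

        node_map = calloc(8, sizeof(long));
        if (!node_map)
                goto out_nmsk;

        /* the object handed back to the caller is allocated last */
        masks = calloc(nvecs, sizeof(*masks));
        if (!masks)
                goto out_node_map;

        /* ... spread vectors using nmsk/node_map ... */

out_node_map:                   /* reached on success and failure alike: */
        free(node_map);         /* the helpers are always temporary      */
out_nmsk:
        free(nmsk);
        return masks;           /* NULL on any failure, never a stale pointer */
}

int main(void)
{
        struct cpumask *m = create_masks(4);

        printf("%s\n", m ? "allocated" : "failed");
        free(m);
        return 0;
}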
Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-2-ming.lei@redhat.com --- kernel/irq/affinity.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e0665549af59..272c968d9ef1 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_possible_cpumask(void) +static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_possible_cpumask(cpumask_var_t *masks) +static void free_node_to_cpumask(cpumask_var_t *masks) { int node; @@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_possible_cpumask(cpumask_var_t *masks) +static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; @@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_possible_cpumask; + cpumask_var_t nmsk, *node_to_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -121,8 +121,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); - if (!node_to_possible_cpumask) + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) goto outcpumsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_possible_cpumask(node_to_possible_cpumask); - nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, + build_node_to_cpumask(node_to_cpumask); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_possible_cpumask[n]); + node_to_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = 
cpumask_weight(nmsk); @@ -193,7 +193,7 @@ done: for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); outnodemsk: - free_node_to_possible_cpumask(node_to_possible_cpumask); + free_node_to_cpumask(node_to_cpumask); outcpumsk: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From b3e6aaa8d94d618e685c4df08bef991a4fb43923 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:56 +0800 Subject: genirq/affinity: Move actual irq vector spreading into a helper function No functional change, just prepare for converting to 2-stage irq vector spreading. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 97 +++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 272c968d9ef1..a9c36904500c 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,50 +94,19 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -/** - * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @nvecs: The total number of vectors - * @affd: Description of the affinity requirements - * - * Returns the masks pointer or NULL if allocation failed. - */ -struct cpumask * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { - int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; + int curvec = affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_cpumask; + int n, nodes, cpus_per_vec, extra_vecs; - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. 
- */ - if (!affv) - return NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto outcpumsk; - - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto outnodemsk; - - /* Fill out vectors at the beginning that don't need affinity */ - for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_cpumask(node_to_cpumask); - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, - &nodemsk); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the @@ -150,7 +119,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (++curvec == last_affv) break; } - goto done; + goto out; } for_each_node_mask(n, nodemsk) { @@ -160,7 +129,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -186,7 +155,51 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) --nodes; } -done: +out: + return curvec - affd->pre_vectors; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +{ + cpumask_var_t nmsk, *node_to_cpumask; + struct cpumask *masks = NULL; + int curvec; + + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (nvecs == affd->pre_vectors + affd->post_vectors) + return NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, + cpu_possible_mask, nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ -- cgit v1.2.3 From 1a2d0914e23aab386f5d5acb689777e24151c2c8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:57 +0800 Subject: genirq/affinity: Allow irq spreading from a given starting point To support two stage irq vector spreading, it's required to add a starting point to the spreading function. No functional change, just preparatory work for the actual two stage change. 
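The arithmetic being factored out is easy to exercise in isolation: the remaining vectors are divided among the remaining nodes, and each node's CPUs are divided among that node's vectors, with the first few vectors absorbing the rounding remainder. A simplified standalone sketch (plain arrays instead of cpumasks, no pre/post vectors; not the kernel helper itself):

#include <stdio.h>

/* Distribute 'numvecs' vectors over NUMA nodes with the given CPU
 * counts, mimicking the vecs_per_node / cpus_per_vec / extra_vecs
 * split the kernel helper performs per node. */
static void spread(int numvecs, const int *node_cpus, int nnodes)
{
        int curvec = 0, nodes = nnodes;

        for (int n = 0; n < nnodes && curvec < numvecs; n++, nodes--) {
                int vecs_per_node = (numvecs - curvec) / nodes;
                int ncpus = node_cpus[n];
                int vecs = vecs_per_node < ncpus ? vecs_per_node : ncpus;
                int cpus_per_vec = ncpus / vecs;
                int extra = ncpus % vecs;   /* rounding-error fixup */

                for (int v = 0; v < vecs && curvec < numvecs; v++, curvec++)
                        printf("vec %d: node %d gets %d cpu(s)\n",
                               curvec, n, cpus_per_vec + (v < extra));
        }
}

int main(void)
{
        int cpus[] = { 6, 2 };  /* CPUs per node on a two-node machine */

        spread(4, cpus, 2);     /* 4 irq vectors to distribute */
        return 0;
}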
[ tglx: Renamed variables, tidied up the code and massaged changelog ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-4-ming.lei@redhat.com --- kernel/irq/affinity.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a9c36904500c..213695a27ddb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,17 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { - int affv = nvecs - affd->pre_vectors - affd->post_vectors; - int last_affv = affv + affd->pre_vectors; - int curvec = affd->pre_vectors; + int n, nodes, cpus_per_vec, extra_vecs, done = 0; + int last_affv = affd->pre_vectors + numvecs; + int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; - int n, nodes, cpus_per_vec, extra_vecs; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); @@ -112,12 +112,13 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ - if (affv <= nodes) { + if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, - node_to_cpumask[n]); - if (++curvec == last_affv) + cpumask_copy(masks + curvec, node_to_cpumask[n]); + if (++done == numvecs) break; + if (++curvec == last_affv) + curvec = affd->pre_vectors; } goto out; } @@ -126,7 +127,7 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); @@ -150,13 +151,16 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= last_affv) + done += v; + if (done >= numvecs) break; + if (curvec >= last_affv) + curvec = affd->pre_vectors; --nodes; } out: - return curvec - affd->pre_vectors; + return done; } /** @@ -169,9 +173,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { + int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; cpumask_var_t nmsk, *node_to_cpumask; struct cpumask *masks = NULL; - int curvec; /* * If there aren't any vectors left after applying the pre/post @@ -198,8 +202,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, - cpu_possible_mask, nmsk, masks); + curvec += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_possible_mask, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity 
*/
-- cgit v1.2.3
From d3056812e7dfe6bf4f8ad9e397a9116dd5d32d15 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:58 +0800 Subject: genirq/affinity: Spread irq vectors among present CPUs as far as possible Commit 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") tried to spread the interrupts across all possible CPUs to make sure that in case of physical hotplug (e.g. virtualization) the CPUs which get plugged in after the device was initialized are targeted by a hardware queue and the corresponding interrupt. This has a downside in cases where the ACPI tables claim that there are more possible CPUs than present CPUs and the number of interrupts to spread out is smaller than the number of possible CPUs. These bogus ACPI tables are unfortunately not uncommon. In such a case the vector spreading algorithm assigns interrupts to CPUs which can never be utilized and as a consequence these interrupts are unused instead of being mapped to present CPUs. As a result the performance of the device is suboptimal. To fix this spread the interrupt vectors in two stages: 1) Spread as many interrupts as possible among the present CPUs 2) Spread the remaining vectors among non present CPUs On an 8 core system, where CPU 0-3 are present and CPU 4-7 are not present, for a device with 4 queues the resulting interrupt affinity is: 1) Before 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") irq 39, cpu list 0 irq 40, cpu list 1 irq 41, cpu list 2 irq 42, cpu list 3 2) With 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") irq 39, cpu list 0-2 irq 40, cpu list 3-4,6 irq 41, cpu list 5 irq 42, cpu list 7 3) With the refined vector spread applied: irq 39, cpu list 0,4 irq 40, cpu list 1,6 irq 41, cpu list 2,5 irq 42, cpu list 3,7 On an 8 core system, where all CPUs are present the resulting interrupt affinity for the 4 queues is: irq 39, cpu list 0,1 irq 40, cpu list 2,3 irq 41, cpu list 4,5 irq 42, cpu list 6,7 This is independent of the number of CPUs which are online at the point of initialization because in such a system the offline CPUs can be easily onlined afterwards, while non-present CPUs need to be plugged physically or virtually, which requires external interaction. The downside of this approach is that in case of physical hotplug the interrupt vector spreading might be suboptimal when CPUs 4-7 are physically plugged. Suboptimal from a NUMA point of view and due to the single target nature of interrupt affinities the later plugged CPUs might not be targeted by interrupts at all. Though, physical hotplug systems are not the common case while the broken ACPI table disease is widespread. So it's preferred to have as many interrupts as possible utilized at the point where the device is initialized. Block multi-queue devices like NVME create a hardware queue per possible CPU, so the goal of commit 84676c1f21 to assign one interrupt vector per possible CPU is still achieved even with physical/virtual hotplug.
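The two stages can be modelled as two calls to one spreading routine, with curvec wrapped back to the first affinity vector when stage one has already consumed them all. A hedged sketch of just that control flow (NUMA is ignored, which is why the resulting pairs differ slightly from the 0,4 / 1,6 / 2,5 / 3,7 lists above):

#include <stdio.h>

#define NVECS 4

/* Assign a set of CPUs to NVECS vectors round-robin, starting at
 * *curvec, so a second pass shares vectors with the first. */
static void stage(const int *cpus, int ncpus, int *curvec,
                  int vecmap[][8], int *veclen)
{
        for (int i = 0; i < ncpus; i++) {
                int v = *curvec;

                vecmap[v][veclen[v]++] = cpus[i];
                *curvec = (v + 1) % NVECS;
        }
}

int main(void)
{
        int present[]    = { 0, 1, 2, 3 };  /* present at init time     */
        int notpresent[] = { 4, 5, 6, 7 };  /* possible but not present */
        int vecmap[NVECS][8], veclen[NVECS] = { 0 }, curvec = 0;

        stage(present, 4, &curvec, vecmap, veclen);     /* stage 1 */
        curvec = 0;      /* stage 1 used every vector: start over, as
                          * the usedvecs >= affvecs check does */
        stage(notpresent, 4, &curvec, vecmap, veclen);  /* stage 2 */

        for (int v = 0; v < NVECS; v++) {
                printf("irq vec %d: cpus", v);
                for (int i = 0; i < veclen[v]; i++)
                        printf(" %d", vecmap[v][i]);
                printf("\n");
        }
        return 0;
}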
[ tglx: Changed from online to present CPUs for the first spreading stage, renamed variables for readability sake, added comments and massaged changelog ] Reported-by: Laurence Oberman Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-5-ming.lei@redhat.com --- kernel/irq/affinity.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 213695a27ddb..f4f29b9d90ee 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -106,6 +106,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; + if (!cpumask_weight(cpu_mask)) + return 0; + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* @@ -173,8 +176,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - cpumask_var_t nmsk, *node_to_cpumask; + int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + int curvec, usedvecs; + cpumask_var_t nmsk, npresmsk, *node_to_cpumask; struct cpumask *masks = NULL; /* @@ -187,9 +191,12 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto outcpumsk; + node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) - goto outcpumsk; + goto outnpresmsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) @@ -202,16 +209,40 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, cpu_possible_mask, - nmsk, masks); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. + */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); + outnodemsk: free_node_to_cpumask(node_to_cpumask); +outnpresmsk: + free_cpumask_var(npresmsk); outcpumsk: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From 621b6d2ea297d0fb6030452c5bcd221f12165fcf Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 9 Apr 2018 19:03:46 +0900 Subject: perf/core: Fix use-after-free in uprobe_perf_close() A use-after-free bug was caught by KASAN while running usdt related code (BCC project. 
bcc/tests/python/test_usdt2.py): ================================================================== BUG: KASAN: use-after-free in uprobe_perf_close+0x222/0x3b0 Read of size 4 at addr ffff880384f9b4a4 by task test_usdt2.py/870 CPU: 4 PID: 870 Comm: test_usdt2.py Tainted: G W 4.16.0-next-20180409 #215 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0xc7/0x15b ? show_regs_print_info+0x5/0x5 ? printk+0x9c/0xc3 ? kmsg_dump_rewind_nolock+0x6e/0x6e ? uprobe_perf_close+0x222/0x3b0 print_address_description+0x83/0x3a0 ? uprobe_perf_close+0x222/0x3b0 kasan_report+0x1dd/0x460 ? uprobe_perf_close+0x222/0x3b0 uprobe_perf_close+0x222/0x3b0 ? probes_open+0x180/0x180 ? free_filters_list+0x290/0x290 trace_uprobe_register+0x1bb/0x500 ? perf_event_attach_bpf_prog+0x310/0x310 ? probe_event_disable+0x4e0/0x4e0 perf_uprobe_destroy+0x63/0xd0 _free_event+0x2bc/0xbd0 ? lockdep_rcu_suspicious+0x100/0x100 ? ring_buffer_attach+0x550/0x550 ? kvm_sched_clock_read+0x1a/0x30 ? perf_event_release_kernel+0x3e4/0xc00 ? __mutex_unlock_slowpath+0x12e/0x540 ? wait_for_completion+0x430/0x430 ? lock_downgrade+0x3c0/0x3c0 ? lock_release+0x980/0x980 ? do_raw_spin_trylock+0x118/0x150 ? do_raw_spin_unlock+0x121/0x210 ? do_raw_spin_trylock+0x150/0x150 perf_event_release_kernel+0x5d4/0xc00 ? put_event+0x30/0x30 ? fsnotify+0xd2d/0xea0 ? sched_clock_cpu+0x18/0x1a0 ? __fsnotify_update_child_dentry_flags.part.0+0x1b0/0x1b0 ? pvclock_clocksource_read+0x152/0x2b0 ? pvclock_read_flags+0x80/0x80 ? kvm_sched_clock_read+0x1a/0x30 ? sched_clock_cpu+0x18/0x1a0 ? pvclock_clocksource_read+0x152/0x2b0 ? locks_remove_file+0xec/0x470 ? pvclock_read_flags+0x80/0x80 ? fcntl_setlk+0x880/0x880 ? ima_file_free+0x8d/0x390 ? lockdep_rcu_suspicious+0x100/0x100 ? ima_file_check+0x110/0x110 ? fsnotify+0xea0/0xea0 ? kvm_sched_clock_read+0x1a/0x30 ? rcu_note_context_switch+0x600/0x600 perf_release+0x21/0x40 __fput+0x264/0x620 ? fput+0xf0/0xf0 ? do_raw_spin_unlock+0x121/0x210 ? do_raw_spin_trylock+0x150/0x150 ? SyS_fchdir+0x100/0x100 ? fsnotify+0xea0/0xea0 task_work_run+0x14b/0x1e0 ? task_work_cancel+0x1c0/0x1c0 ? copy_fd_bitmaps+0x150/0x150 ? vfs_read+0xe5/0x260 exit_to_usermode_loop+0x17b/0x1b0 ? trace_event_raw_event_sys_exit+0x1a0/0x1a0 do_syscall_64+0x3f6/0x490 ? syscall_return_slowpath+0x2c0/0x2c0 ? lockdep_sys_exit+0x1f/0xaa ? syscall_return_slowpath+0x1a3/0x2c0 ? lockdep_sys_exit+0x1f/0xaa ? prepare_exit_to_usermode+0x11c/0x1e0 ? enter_from_user_mode+0x30/0x30 random: crng init done ? 
__put_user_4+0x1c/0x30 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x7f41d95f9340 RSP: 002b:00007fffe71e4268 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 000000000000000d RCX: 00007f41d95f9340 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 000000000000000d RBP: 0000000000000000 R08: 00007f41ca8ff700 R09: 00007f41d996dd1f R10: 00007fffe71e41e0 R11: 0000000000000246 R12: 00007fffe71e4330 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007fffe71e4290 Allocated by task 870: kasan_kmalloc+0xa0/0xd0 kmem_cache_alloc_node+0x11a/0x430 copy_process.part.19+0x11a0/0x41c0 _do_fork+0x1be/0xa20 do_syscall_64+0x198/0x490 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Freed by task 0: __kasan_slab_free+0x12e/0x180 kmem_cache_free+0x102/0x4d0 free_task+0xfe/0x160 __put_task_struct+0x189/0x290 delayed_put_task_struct+0x119/0x250 rcu_process_callbacks+0xa6c/0x1b60 __do_softirq+0x238/0x7ae The buggy address belongs to the object at ffff880384f9b480 which belongs to the cache task_struct of size 12928 It occurs because the task_struct is freed before the perf_event that refers to the task, and the task's flags are checked during teardown of the event. perf_event_alloc() assigns task_struct to hw.target of perf_event, but there is no reference counting for it. As a fix, we take a reference with get_task_struct() in perf_event_alloc() at the above-mentioned assignment and drop it with put_task_struct() in _free_event(). Signed-off-by: Prashant Bhole Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 63b6da39bb38 ("perf: Fix perf_event_exit_task() race") Link: http://lkml.kernel.org/r/20180409100346.6416-1-bhole_prashant_q7@lab.ntt.co.jp Signed-off-by: Ingo Molnar
--- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel')
diff --git a/kernel/events/core.c b/kernel/events/core.c index fc1c330c6bd6..d7af82827373 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4447,6 +4447,9 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->hw.target) + put_task_struct(event->hw.target); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -9955,6 +9958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx.
*/ + get_task_struct(task); event->hw.target = task; } @@ -10070,6 +10074,8 @@ err_ns: perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); + if (event->hw.target) + put_task_struct(event->hw.target); kfree(event); return ERR_PTR(err); -- cgit v1.2.3 From 5da13ab8b0dcaa984c45ae43edf5a4d148603d42 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 9 Apr 2018 21:16:54 +0900 Subject: perf/core: Fix perf_kprobe_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix error handling in perf_kprobe_init(): ================================================================== BUG: KASAN: slab-out-of-bounds in strlen+0x8e/0xa0 lib/string.c:482 Read of size 1 at addr ffff88003f9cc5c0 by task syz-executor2/23095 CPU: 0 PID: 23095 Comm: syz-executor2 Not tainted 4.16.0+ #24 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xca/0x13e lib/dump_stack.c:113 print_address_description+0x6e/0x2c0 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report+0x256/0x380 mm/kasan/report.c:412 strlen+0x8e/0xa0 lib/string.c:482 kstrdup+0x21/0x70 mm/util.c:55 alloc_trace_kprobe+0xc8/0x930 kernel/trace/trace_kprobe.c:325 create_local_trace_kprobe+0x4f/0x3a0 kernel/trace/trace_kprobe.c:1438 perf_kprobe_init+0x149/0x1f0 kernel/trace/trace_event_perf.c:264 perf_kprobe_event_init+0xa8/0x120 kernel/events/core.c:8407 perf_try_init_event+0xcb/0x2a0 kernel/events/core.c:9719 perf_init_event kernel/events/core.c:9750 [inline] perf_event_alloc+0x1367/0x1e20 kernel/events/core.c:10022 SYSC_perf_event_open+0x242/0x2330 kernel/events/core.c:10477 do_syscall_64+0x198/0x640 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Reported-by: 范龙飞 Signed-off-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Thomas Gleixner Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 2c416509b834..94600f1f7efa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -252,6 +252,8 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) ret = strncpy_from_user( func, u64_to_user_ptr(p_event->attr.kprobe_func), KSYM_NAME_LEN); + if (ret == KSYM_NAME_LEN) + ret = -E2BIG; if (ret < 0) goto out; -- cgit v1.2.3 From 0eadcc7a7bc03e991d2da1cf88143fb7cc0342c1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 9 Apr 2018 18:31:30 +0000 Subject: perf/core: Fix perf_uprobe_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to the uprobe PMU fix in perf_kprobe_init(), fix error handling in perf_uprobe_init() as well. 
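Both perf_kprobe_init() and perf_uprobe_init() rely on the same strncpy_from_user() convention: it returns the length of the copied string, or the full buffer size when the source did not fit, in which case nothing guarantees NUL termination. A userspace stand-in (mock_strncpy_from_user() and the KSYM_NAME_LEN value here are illustrative) showing why ret equal to the buffer size must be treated as -E2BIG:

#include <stdio.h>
#include <string.h>

#define KSYM_NAME_LEN 8    /* shrunk for the demo */
#define E2BIG 7            /* Linux errno value */

/* Userspace stand-in for strncpy_from_user(): copies at most 'count'
 * bytes and returns the string length, or 'count' if it did not fit
 * (in which case no NUL was written). */
static long mock_strncpy_from_user(char *dst, const char *src, long count)
{
        long len = strlen(src);

        if (len >= count) {
                memcpy(dst, src, count);
                return count;          /* truncated */
        }
        memcpy(dst, src, len + 1);
        return len;
}

static int fetch_symbol(const char *user_str)
{
        char func[KSYM_NAME_LEN];
        long ret = mock_strncpy_from_user(func, user_str, KSYM_NAME_LEN);

        if (ret == KSYM_NAME_LEN)      /* the fix: truncation is an error */
                return -E2BIG;
        if (ret < 0)
                return (int)ret;
        return 0;
}

int main(void)
{
        printf("%d\n", fetch_symbol("ok"));                 /* 0      */
        printf("%d\n", fetch_symbol("much_too_long_name")); /* -E2BIG */
        return 0;
}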
Reported-by: 范龙飞 Signed-off-by: Song Liu Acked-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Signed-off-by: Ingo Molnar
--- kernel/trace/trace_event_perf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel')
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 94600f1f7efa..c79193e598f5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -302,6 +302,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) return -ENOMEM; ret = strncpy_from_user( path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret == PATH_MAX) + return -E2BIG; if (ret < 0) goto out; if (path[0] == '\0') {
-- cgit v1.2.3
From 50268a3d266ecfdd6c5873d62b2758d9732fc598 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Apr 2018 21:20:08 +0900 Subject: tracing/uprobe_event: Fix strncpy corner case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the string fetch function to terminate with NUL. It is OK to drop the rest of the string. Signed-off-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Thomas Gleixner Cc: security@kernel.org Cc: 范龙飞 Fixes: 5baaa59ef09e ("tracing/probes: Implement 'memory' fetch method for uprobes") Signed-off-by: Ingo Molnar
--- kernel/trace/trace_uprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2014f4351ae0..0d450b40988e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,6 +151,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, return; ret = strncpy_from_user(dst, src, maxlen); + if (ret == maxlen) + dst[--ret] = '\0'; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0';
-- cgit v1.2.3
From 0a4d0564f0fc377ad0ba66ce214b9b16324aea4b Mon Sep 17 00:00:00 2001 From: Jérémy Lefaure Date: Sun, 15 Oct 2017 21:22:49 -0400 Subject: tracing: Use ARRAY_SIZE() macro instead of open coding it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useless to re-invent the ARRAY_SIZE macro so let's use it instead of DATA_CNT.
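For reference, the kernel's ARRAY_SIZE() in include/linux/kernel.h is essentially the classic sizeof ratio (plus a __must_be_array() helper so that passing a pointer fails to compile); a simplified version is enough to show what replaces a hand-rolled DATA_CNT:

#include <stdio.h>

/* Simplified ARRAY_SIZE(); the real kernel macro adds
 * __must_be_array() to reject pointers at compile time. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

struct test_filter_data_t { const char *filter; int expect; };

static struct test_filter_data_t test_filter_data[] = {
        { "common_pid == 0", 1 },
        { "common_pid != 0", 0 },
};

int main(void)
{
        /* stands in for the removed DATA_CNT macro */
        printf("%zu entries\n", ARRAY_SIZE(test_filter_data));
        return 0;
}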
Found with Coccinelle with the following semantic patch: @r depends on (org || report)@ type T; T[] E; position p; @@ ( (sizeof(E)@p /sizeof(*E)) | (sizeof(E)@p /sizeof(E[...])) | (sizeof(E)@p /sizeof(T)) ) Link: http://lkml.kernel.org/r/20171016012250.26453-1-jeremy.lefaure@lse.epita.fr Signed-off-by: Jérémy Lefaure [ Removed useless include of kernel.h ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1bda4ec95e18..5eba1cec945c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2140,7 +2140,7 @@ static struct test_filter_data_t { #undef YES #undef NO -#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) +#define DATA_CNT ARRAY_SIZE(test_filter_data) static int test_pred_visited; -- cgit v1.2.3 From f0a2aa5a2a406d0a57aa9b320ffaa5538672b6c5 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Tue, 10 Apr 2018 16:10:30 -0700 Subject: tracing/uprobe: Add support for overlayfs uprobes cannot successfully attach to binaries located in a directory mounted with overlayfs. To verify, create directories for mounting overlayfs (upper,lower,work,merge), move some binary into merge/ and use readelf to obtain some known instruction of the binary. I used /bin/true and the entry instruction(0x13b0): $ mount -t overlay overlay -o lowerdir=lower,upperdir=upper,workdir=work merge $ cd /sys/kernel/debug/tracing $ echo 'p:true_entry PATH_TO_MERGE/merge/true:0x13b0' > uprobe_events $ echo 1 > events/uprobes/true_entry/enable This returns 'bash: echo: write error: Input/output error' and dmesg tells us 'event trace: Could not enable event true_entry' This change makes create_trace_uprobe() look for the real inode of a dentry. In the case of normal filesystems, this simplifies to just returning the inode. In the case of overlayfs(and similar fs) we will obtain the underlying dentry and corresponding inode, upon which uprobes can successfully register. Running the example above with the patch applied, we can see that the uprobe is enabled and will output to trace as expected. Link: http://lkml.kernel.org/r/20180410231030.2720-1-hmclauchlan@fb.com Reviewed-by: Josef Bacik Reviewed-by: Masami Hiramatsu Reviewed-by: Srikar Dronamraju Signed-off-by: Howard McLauchlan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029ae1be6..8b86d76c55ee 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -446,7 +446,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - inode = igrab(d_inode(path.dentry)); + inode = igrab(d_real_inode(path.dentry)); path_put(&path); if (!inode || !S_ISREG(inode->i_mode)) { -- cgit v1.2.3 From 18d45b11d96e6f9b3814960a1394083a3d6b7f74 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Mar 2018 13:57:55 +0530 Subject: trace_uprobe: Use %lx to display offset tu->offset is unsigned long, not a pointer, thus %lx should be used to print it, not the %px. 
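The format chosen by the fix that follows, "0x%0*lx", keeps the zero-padded, pointer-width output while printing a plain unsigned long: the '*' pulls the field width from an int argument computed as two hex digits per pointer byte. A quick userspace check (the 0x13b0 offset reuses the example value from the overlayfs changelog above):

#include <stdio.h>

int main(void)
{
        unsigned long offset = 0x13b0;

        /* width = 2 hex digits per pointer byte: 8 on 32-bit, 16 on 64-bit */
        printf("0x%0*lx\n", (int)(sizeof(void *) * 2), offset);
        return 0;
}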
Link: http://lkml.kernel.org/r/20180315082756.9050-1-ravi.bangoria@linux.vnet.ibm.com Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Fixes: 0e4d819d0893 ("trace_uprobe: Display correct offset in uprobe_events") Suggested-by: Kees Cook Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware)
--- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b86d76c55ee..d7d3c9237f64 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%px", (void *)tu->offset); + seq_printf(m, "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); } else { switch (sizeof(void *)) { case 4:
-- cgit v1.2.3
From a64b2c01e67eff8b8d0d438507fd0346290697cf Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Mar 2018 13:57:56 +0530 Subject: trace_uprobe: Simplify probes_seq_show() Simplify probes_seq_show() function. No change in output before and after patch. Link: http://lkml.kernel.org/r/20180315082756.9050-2-ravi.bangoria@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware)
--- kernel/trace/trace_uprobe.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d7d3c9237f64..21604754bb79 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -602,24 +602,9 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:", tu->filename); - - /* Don't print "0x (null)" when offset is 0 */ - if (tu->offset) { - seq_printf(m, "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); - } else { - switch (sizeof(void *)) { - case 4: - seq_printf(m, "0x00000000"); - break; - case 8: - default: - seq_printf(m, "0x0000000000000000"); - break; - } - } + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, + trace_event_name(&tu->tp.call), tu->filename, + (int)(sizeof(void *) * 2), tu->offset); for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
-- cgit v1.2.3
From 0b3dec05dbbce023f4f25aba975b5d253c313ebb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 11 Apr 2018 10:59:46 -0400 Subject: tracing: Enforce passing in filter=NULL to create_filter() There's some inconsistency in what the output parameter filterp should be set to when passed to create_filter(..., struct event_filter **filterp). Whatever filterp points to should be NULL when calling this function. The create_filter() calls create_filter_start() with a pointer to a local "filter" variable that is set to NULL. The create_filter_start() has a WARN_ON() if the passed in pointer isn't pointing to a value set to NULL. Ideally, create_filter() should pass the filterp variable it received to create_filter_start() instead of hiding it behind a local variable; hiding it allowed create_filter() to fail without updating the passed-in filter, after which the caller of create_filter() tried to free filter, which was never initialized to anything, causing memory corruption.
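The contract being enforced can be shown in miniature: the caller initializes the out-parameter to NULL, and the function builds the object through filterp itself, so even on failure the caller sees whatever was allocated and can free it. A hedged userspace sketch (the struct and error codes are simplified, not the tracing code):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct event_filter { char *filter_string; };

/* *filterp must be NULL on entry (a WARN_ON() in the kernel), and the
 * object is built through filterp, never hidden in a local. */
static int create_filter(const char *str, struct event_filter **filterp)
{
        assert(*filterp == NULL);

        *filterp = calloc(1, sizeof(**filterp));
        if (!*filterp)
                return -1;

        (*filterp)->filter_string = strdup(str);
        if (!(*filterp)->filter_string)
                return -1;  /* caller still sees the object and can free it */
        return 0;
}

int main(void)
{
        struct event_filter *filter = NULL;   /* callers initialize to NULL */
        int err = create_filter("common_pid != 0", &filter);

        if (filter) {                         /* freed on success or failure */
                free(filter->filter_string);
                free(filter);
        }
        return err ? 1 : 0;
}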
Link: http://lkml.kernel.org/r/00000000000032a0c30569916870@google.com Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: syzbot+dadcc936587643d7f568@syzkaller.appspotmail.com Signed-off-by: Steven Rostedt (VMware)
--- kernel/trace/trace_events_filter.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5eba1cec945c..9b4716bb8bb0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1704,18 +1704,16 @@ static int create_filter(struct trace_event_call *call, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; - struct event_filter *filter = NULL; int err; - err = create_filter_start(filter_string, set_str, &pe, &filter); + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; - err = process_preds(call, filter_string, filter, pe); + err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, filter); + append_filter_err(pe, *filterp); - *filterp = filter; return err; } @@ -1739,24 +1737,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, true, &pe, &filter); + err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { err = process_system_preds(dir, tr, pe, filter_str); if (!err) { /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; + kfree((*filterp)->filter_string); + (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, filter); + append_filter_err(pe, *filterp); } } create_filter_finish(pe); - *filterp = filter; return err; } @@ -1764,7 +1760,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, int apply_event_filter(struct trace_event_file *file, char *filter_string) { struct trace_event_call *call = file->event_call; - struct event_filter *filter; + struct event_filter *filter = NULL; int err; if (!strcmp(strstrip(filter_string), "0")) { @@ -1817,7 +1813,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, { struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; - struct event_filter *filter; + struct event_filter *filter = NULL; int err = 0; mutex_lock(&event_mutex); @@ -2024,7 +2020,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { int err; - struct event_filter *filter; + struct event_filter *filter = NULL; struct trace_event_call *call; mutex_lock(&event_mutex);
-- cgit v1.2.3
From 32e6e967fb36bf77ed99221ae3ce1909f045d8f9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 11 Apr 2018 18:02:37 +0000 Subject: perf/core: Need CAP_SYS_ADMIN to create k/uprobe with perf_event_open() Non-root users cannot create kprobes or uprobes through the text-based interface (kprobe_events, uprobe_events), so they should not be able to create probes via perf_event_open() either.
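The fix below is a two-line privilege check at event-init time; a userspace mock of the pattern (capable() is faked here, and only the CAP_SYS_ADMIN value is taken from linux/capability.h) illustrates the intended failure mode for unprivileged callers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define CAP_SYS_ADMIN 21   /* value from linux/capability.h */

/* Stand-in for the kernel's capable(); the real one checks the
 * current task's effective capability set. */
static bool capable(int cap)
{
        (void)cap;
        return false;      /* pretend we are an unprivileged task */
}

static int perf_probe_event_init(void)
{
        /* the gate both PMUs gain: text-based probe interfaces are
         * root-only, so perf_event_open() must enforce the same bar */
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        return 0;
}

int main(void)
{
        printf("init: %d\n", perf_probe_event_init());  /* -13 (-EACCES) */
        return 0;
}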
Reported-by: Vince Weaver Signed-off-by: Song Liu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU") Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Link: http://lkml.kernel.org/r/C0B2EFB5-C403-4BDB-9046-C14B3EE66999@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7af82827373..2d5fe26551f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8400,6 +8400,10 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ @@ -8437,6 +8441,10 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ -- cgit v1.2.3 From 60bb83b81169820c691fbfa33a6a4aef32aa4b0b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 13 Apr 2018 15:35:13 -0700 Subject: resource: fix integer overflow at reallocation We've got a bug report indicating a kernel panic at booting on an x86-32 system, and it turned out to be the invalid PCI resource assigned after reallocation. __find_resource() first aligns the resource start address and resets the end address with start+size-1 accordingly, then checks whether it's contained. Here the end address may overflow the integer, although resource_contains() still returns true because the function validates only start and end address. So this ends up with returning an invalid resource (start > end). There was already an attempt to cover such a problem in the commit 47ea91b4052d ("Resource: fix wrong resource window calculation"), but this case is an overseen one. This patch adds the validity check of the newly calculated resource for avoiding the integer overflow problem. Bugzilla: http://bugzilla.opensuse.org/show_bug.cgi?id=1086739 Link: http://lkml.kernel.org/r/s5hpo37d5l8.wl-tiwai@suse.de Fixes: 23c570a67448 ("resource: ability to resize an allocated resource") Signed-off-by: Takashi Iwai Reported-by: Michael Henders Tested-by: Michael Henders Reviewed-by: Andrew Morton Cc: Ram Pai Cc: Bjorn Helgaas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e270b5048988..2af6c03858b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -651,7 +651,8 @@ static int __find_resource(struct resource *root, struct resource *old, alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if (alloc.start <= alloc.end && + resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; -- cgit v1.2.3 From 1cbf29da3628b661379acba7b08a07ef1e5da3b5 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 13 Apr 2018 15:35:34 -0700 Subject: kexec: export PG_swapbacked to VMCOREINFO Since commit 6326fec1122c ("mm: Use owner_priv bit for PageSwapCache, valid when PageSwapBacked"), PG_swapcache is an alias for PG_owner_priv_1, which may be also used for other purposes. 
To know whether the bit indeed has the PG_swapcache meaning, it is necessary to check PG_swapbacked, hence this bit must be exported. Link: http://lkml.kernel.org/r/20180410161345.142e142d@ezekiel.suse.cz Signed-off-by: Petr Tesarik Reviewed-by: Andrew Morton Cc: Dave Young Cc: Xunlei Pang Cc: Baoquan He Cc: Hari Bathini Cc: "Kirill A. Shutemov" Cc: "Marc-Andr Lureau" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a93590cdd9e1..f7674d676889 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -454,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); VMCOREINFO_NUMBER(PG_slab); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); -- cgit v1.2.3 From b799a09f639beeda105fe8a9ab440d80fdabd3b3 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:35:45 -0700 Subject: kexec_file: make use of purgatory optional Patch series "kexec_file, x86, powerpc: refactoring for other architecutres", v2. This is a preparatory patchset for adding kexec_file support on arm64. It was originally included in a arm64 patch set[1], but Philipp is also working on their kexec_file support on s390[2] and some changes are now conflicting. So these common parts were extracted and put into a separate patch set for better integration. What's more, my original patch#4 was split into a few small chunks for easier review after Dave's comment. As such, the resulting code is basically identical with my original, and the only *visible* differences are: - renaming of _kexec_kernel_image_probe() and _kimage_file_post_load_cleanup() - change one of types of arguments at prepare_elf64_headers() Those, unfortunately, require a couple of trivial changes on the rest (#1, #6 to #13) of my arm64 kexec_file patch set[1]. Patch #1 allows making a use of purgatory optional, particularly useful for arm64. Patch #2 commonalizes arch_kexec_kernel_{image_probe, image_load, verify_sig}() and arch_kimage_file_post_load_cleanup() across architectures. Patches #3-#7 are also intended to generalize parse_elf64_headers(), along with exclude_mem_range(), to be made best re-use of. [1] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/561182.html [2] http://lkml.iu.edu//hypermail/linux/kernel/1802.1/02596.html This patch (of 7): On arm64, crash dump kernel's usable memory is protected by *unmapping* it from kernel virtual space unlike other architectures where the region is just made read-only. It is highly unlikely that the region is accidentally corrupted and this observation rationalizes that digest check code can also be dropped from purgatory. The resulting code is so simple as it doesn't require a bit ugly re-linking/relocation stuff, i.e. arch_kexec_apply_relocations_add(). Please see: http://lists.infradead.org/pipermail/linux-arm-kernel/2017-December/545428.html All that the purgatory does is to shuffle arguments and jump into a new kernel, while we still need to have some space for a hash value (purgatory_sha256_digest) which is never checked against. As such, it doesn't make sense to have trampline code between old kernel and new kernel on arm64. This patch introduces a new configuration, ARCH_HAS_KEXEC_PURGATORY, and allows related code to be compiled in only if necessary. 
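The early return added below to kexec_calculate_store_digests() uses IS_ENABLED(), which turns a Kconfig symbol into a compile-time 0/1 so the compiler can discard the dead code entirely. A simplified userspace model of the macro chain from include/linux/kconfig.h (the real IS_ENABLED() additionally handles =m via IS_MODULE()):

#include <stdio.h>

/* CONFIG_FOO is defined to 1 when built in; the chain below turns
 * "defined to 1" into a constant 1 and anything else into 0. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

#define CONFIG_ARCH_HAS_KEXEC_PURGATORY 1   /* comment out to flip */

int main(void)
{
        if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) {
                printf("purgatory disabled: digest check skipped\n");
                return 0;
        }
        printf("purgatory enabled: computing digests\n");
        return 0;
}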
[takahiro.akashi@linaro.org: fix trivial screwup] Link: http://lkml.kernel.org/r/20180309093346.GF25863@linaro.org Link: http://lkml.kernel.org/r/20180306102303.9063-2-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e5bcd94c1efb..ab1dced677fd 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -532,6 +532,9 @@ static int kexec_calculate_store_digests(struct kimage *image) struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; + if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) + return 0; + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; @@ -633,6 +636,7 @@ out: return ret; } +#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY /* Actually load purgatory. Lot of code taken from kexec-tools */ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) @@ -1022,3 +1026,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ -- cgit v1.2.3 From 9ec4ecef0af7790551109283ca039a7c52de343c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:35:49 -0700 Subject: kexec_file,x86,powerpc: factor out kexec_file_ops functions As arch_kexec_kernel_image_{probe,load}(), arch_kimage_file_post_load_cleanup() and arch_kexec_kernel_verify_sig() are almost duplicated among architectures, they can be unified with an architecture-defined kexec_file_ops array. So let's factor them out. Link: http://lkml.kernel.org/r/20180306102303.9063-3-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Cc: Michael Ellerman Cc: Thiago Jung Bauermann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ab1dced677fd..332c4fd12cb1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -28,28 +28,80 @@ static int kexec_calculate_store_digests(struct kimage *image); +/* + * Currently this is the only default function that is exported as some + * architectures need it to do additional handling. + * In the future, other default functions may be exported too if required.
+ */ +int kexec_image_probe_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + const struct kexec_file_ops * const *fops; + int ret = -ENOEXEC; + + for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) { + ret = (*fops)->probe(buf, buf_len); + if (!ret) { + image->fops = *fops; + return ret; + } + } + + return ret; +} + /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { - return -ENOEXEC; + return kexec_image_probe_default(image, buf, buf_len); +} + +static void *kexec_image_load_default(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); } void * __weak arch_kexec_kernel_image_load(struct kimage *image) { - return ERR_PTR(-ENOEXEC); + return kexec_image_load_default(image); +} + +static int kexec_image_post_load_cleanup_default(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); } int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { - return -EINVAL; + return kexec_image_post_load_cleanup_default(image); } #ifdef CONFIG_KEXEC_VERIFY_SIG +static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { - return -EKEYREJECTED; + return kexec_image_verify_sig_default(image, buf, buf_len); } #endif -- cgit v1.2.3 From babac4a84a88842bec477a5bdada1460f3bc374c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:36:06 -0700 Subject: kexec_file, x86: move re-factored code to generic side In the previous patches, commonly-used routines, exclude_mem_range() and prepare_elf64_headers(), were carved out. Now place them in kexec common code. A prefix "crash_" is given to each of their names to avoid possible name collisions. 
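As an illustration of how an architecture might call the renamed helper (a hypothetical caller, not taken from this series; crash_exclude_mem_range()'s signature is as in the diff below):

	/* Hypothetical: carve the crashkernel window out of a prepared range list. */
	static int exclude_crashkernel(struct crash_mem *cmem)
	{
		/* returns -ENOMEM when a range split no longer fits in cmem */
		return crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
	}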
Link: http://lkml.kernel.org/r/20180306102303.9063-8-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 332c4fd12cb1..b06b1fac5252 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -22,6 +22,11 @@ #include <linux/ima.h> #include <crypto/hash.h> #include <crypto/sha.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" @@ -1079,3 +1084,173 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } #endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ + +int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happened, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == mem->max_nr_ranges - 1) + return -ENOMEM; + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf; + unsigned int cpu, i; + unsigned long long notes_addr; + unsigned long mstart, mend; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += mem->nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and the other will have __va(physical) addresses.
+ */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + ehdr = (Elf64_Ehdr *)buf; + phdr = (Elf64_Phdr *)(ehdr + 1); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + phdr++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; + (ehdr->e_phnum)++; + phdr++; + + /* Prepare PT_LOAD type program header for kernel text region */ + if (kernel_map) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + ehdr->e_phnum++; + phdr++; + } + + /* Go through all the ranges in mem->ranges[] and prepare phdr */ + for (i = 0; i < mem->nr_ranges; i++) { + mstart = mem->ranges[i].start; + mend = mem->ranges[i].end; + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + phdr++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + *addr = buf; + *sz = elf_sz; + return 0; +} -- cgit v1.2.3 From d2b8178ca7324a21495cb71049b4e4a041ab5942 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:13 -0700 Subject: kernel/kexec_file.c: remove checks in kexec_purgatory_load Before the purgatory is loaded, several checks are done to verify that the ELF file in kexec_purgatory is valid. These checks are incomplete. For example, they don't check for the total size of the sections defined in the section header table or if the entry point actually points into the purgatory. On the other hand the purgatory, although an ELF file on its own, is part of the kernel. Thus not trusting the purgatory means not trusting the kernel build itself. So remove all validity checks on the purgatory and just trust the kernel build.
Link: http://lkml.kernel.org/r/20180321112751.22196-3-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b06b1fac5252..81ba4f782486 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -941,22 +941,8 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; - ret = __kexec_load_purgatory(image, min, max, top_down); if (ret) return ret; -- cgit v1.2.3 From 65c225d3280542f3ea145e052215ce0538f6bb69 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:17 -0700 Subject: kernel/kexec_file.c: make purgatory_info->ehdr const The kexec_purgatory buffer is read-only. Thus all pointers into kexec_purgatory are read-only, too. Point this out by explicitly marking purgatory_info->ehdr as 'const' and update the comments in purgatory_info. Link: http://lkml.kernel.org/r/20180321112751.22196-4-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 81ba4f782486..12cf9c9ff0bc 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -941,7 +941,7 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; ret = __kexec_load_purgatory(image, min, max, top_down); if (ret) @@ -965,9 +965,9 @@ out: static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, const char *name) { + const Elf_Ehdr *ehdr; Elf_Sym *syms; Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; int i, k; const char *strtab; -- cgit v1.2.3 From 961d921a1b967f76e13f9e11f2b0c2bcb5741f10 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:21 -0700 Subject: kernel/kexec_file.c: search symbols in read-only kexec_purgatory The stripped purgatory does not contain a symtab. So when looking for symbols this is done in read-only kexec_purgatory. Highlight this by marking the corresponding variables as 'const'. 
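The lookups in the patch below all follow the standard in-memory ELF layout. As a hedged reference sketch (field offsets per the ELF specification; kexec_purgatory as in the surrounding code):

	/* Locating the tables inside a read-only, in-memory ELF object: */
	const Elf_Ehdr *ehdr = (const Elf_Ehdr *)kexec_purgatory;
	const Elf_Shdr *sechdrs = (const void *)ehdr + ehdr->e_shoff;	/* section headers */
	/* and, for a section i of type SHT_SYMTAB: */
	const Elf_Sym *syms = (const void *)ehdr + sechdrs[i].sh_offset;	/* symbol table */
	const char *strtab = (const void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;	/* names */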
Link: http://lkml.kernel.org/r/20180321112751.22196-5-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 12cf9c9ff0bc..9bd1ec1dd875 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -962,20 +962,27 @@ out: return ret; } -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) +/* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. + */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) { + const Elf_Shdr *sechdrs; const Elf_Ehdr *ehdr; - Elf_Sym *syms; - Elf_Shdr *sechdrs; - int i, k; + const Elf_Sym *syms; const char *strtab; + int i, k; - if (!pi->sechdrs || !pi->ehdr) + if (!pi->ehdr) return NULL; - sechdrs = pi->sechdrs; ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; for (i = 0; i < ehdr->e_shnum; i++) { if (sechdrs[i].sh_type != SHT_SYMTAB) @@ -984,8 +991,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, if (sechdrs[i].sh_link >= ehdr->e_shnum) /* Invalid strtab section number */ continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; /* Go through symbols for a match */ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { @@ -1013,7 +1020,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; + const Elf_Sym *sym; Elf_Shdr *sechdr; sym = kexec_purgatory_find_symbol(pi, name); @@ -1036,9 +1043,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value) { - Elf_Sym *sym; - Elf_Shdr *sechdrs; struct purgatory_info *pi = &image->purgatory_info; + const Elf_Sym *sym; + Elf_Shdr *sec; char *sym_buf; sym = kexec_purgatory_find_symbol(pi, name); @@ -1051,16 +1058,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sechdrs = pi->sechdrs; + sec = pi->sechdrs + sym->st_shndx; - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sec->sh_type == SHT_NOBITS) { pr_err("symbol %s is in a bss section. Cannot %s\n", name, get_value ? 
"get" : "set"); return -EINVAL; } - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; + sym_buf = (char *)sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); -- cgit v1.2.3 From 8aec395b8478310521031157ef5d44ef19c2c581 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:24 -0700 Subject: kernel/kexec_file.c: use read-only sections in arch_kexec_apply_relocations* When the relocations are applied to the purgatory only the section the relocations are applied to is writable. The other sections, i.e. the symtab and .rel/.rela, are in read-only kexec_purgatory. Highlight this by marking the corresponding variables as 'const'. While at it also change the signatures of arch_kexec_apply_relocations* to take section pointers instead of just the index of the relocation section. This removes the second lookup and sanity check of the sections in arch code. Link: http://lkml.kernel.org/r/20180321112751.22196-6-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 63 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9bd1ec1dd875..5c70f7f2bae3 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -110,19 +110,35 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, } #endif -/* Apply relocations of type RELA */ +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } -/* Apply relocations of type REL */ +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. 
+ */ int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; @@ -879,14 +895,19 @@ static int kexec_apply_relocations(struct kimage *image) { int i, ret; struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; + const Elf_Shdr *sechdrs; + + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - /* Apply relocations */ for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; + const Elf_Shdr *relsec; + const Elf_Shdr *symtab; + Elf_Shdr *section; + + relsec = sechdrs + i; - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) + if (relsec->sh_type != SHT_RELA && + relsec->sh_type != SHT_REL) continue; /* @@ -895,12 +916,12 @@ static int kexec_apply_relocations(struct kimage *image) * symbol table. And ->sh_info contains section header * index of section to which relocations apply. */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) + if (relsec->sh_info >= pi->ehdr->e_shnum || + relsec->sh_link >= pi->ehdr->e_shnum) return -ENOEXEC; - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; + section = pi->sechdrs + relsec->sh_info; + symtab = sechdrs + relsec->sh_link; if (!(section->sh_flags & SHF_ALLOC)) continue; @@ -917,12 +938,12 @@ static int kexec_apply_relocations(struct kimage *image) * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); + if (relsec->sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi, section, + relsec, symtab); + else if (relsec->sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi, section, + relsec, symtab); if (ret) return ret; } -- cgit v1.2.3 From 930457057abe4e6d57433dea75e97e0e39fd0ab6 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:28 -0700 Subject: kernel/kexec_file.c: split up __kexec_load_purgatory When inspecting __kexec_load_purgatory you find that it has two tasks: 1) setting up the kexec_buffer for the new kernel and 2) setting up pi->sechdrs for the final load address. The two tasks are independent of each other. To improve readability split up __kexec_load_purgatory into two functions, one for each task, and call them directly from kexec_load_purgatory. Link: http://lkml.kernel.org/r/20180321112751.22196-7-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 200 +++++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 5c70f7f2bae3..878b97bd3067 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -710,39 +710,97 @@ out: } #ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY -/* Actually load purgatory.
Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) +/* + * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer to setup. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, + struct kexec_buf *kbuf) { - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, bss_align, bss_sz, bss_pad; - unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; + const Elf_Shdr *sechdrs; + unsigned long bss_align; + unsigned long bss_sz; + unsigned long align; + int i, ret; - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + bss_align = 1; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (kbuf->buf_align < align) + kbuf->buf_align = align; + kbuf->bufsz = ALIGN(kbuf->bufsz, align); + kbuf->bufsz += sechdrs[i].sh_size; + } else { + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align); + kbuf->memsz = kbuf->bufsz + bss_sz; + if (kbuf->buf_align < bss_align) + kbuf->buf_align = bss_align; + + kbuf->buffer = vzalloc(kbuf->bufsz); + if (!kbuf->buffer) + return -ENOMEM; + pi->purgatory_buf = kbuf->buffer; + + ret = kexec_add_buffer(kbuf); + if (ret) + goto out; + pi->purgatory_load_addr = kbuf->mem; + + return 0; +out: + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + return ret; +} + +/* + * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer prepared to store purgatory. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, + struct kexec_buf *kbuf) +{ + unsigned long curr_load_addr; + unsigned long load_addr; + unsigned long bss_addr; + unsigned long offset; + unsigned char *buf_addr; + unsigned char *src; + Elf_Shdr *sechdrs; + int entry_sidx = -1; + int i; - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); if (!sechdrs) return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, + pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + pi->sechdrs = sechdrs; /* * We seem to have multiple copies of sections. 
First copy is which @@ -770,7 +828,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, * Identify entry point section and make entry relative to section * start. */ - entry = pi->ehdr->e_entry; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; @@ -783,63 +841,19 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > pi->ehdr->e_entry)) { entry_sidx = i; - entry -= sechdrs[i].sh_addr; + kbuf->image->start -= sechdrs[i].sh_addr; break; } } - /* Determine how much memory is needed to load relocatable object. */ - bss_align = 1; - bss_sz = 0; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - if (kbuf.buf_align < align) - kbuf.buf_align = align; - kbuf.bufsz = ALIGN(kbuf.bufsz, align); - kbuf.bufsz += sechdrs[i].sh_size; - } else { - /* bss section */ - if (bss_align < align) - bss_align = align; - bss_sz = ALIGN(bss_sz, align); - bss_sz += sechdrs[i].sh_size; - } - } - - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (kbuf.bufsz & (bss_align - 1)) - bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - - kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; - - /* Allocate buffer for purgatory */ - kbuf.buffer = vzalloc(kbuf.bufsz); - if (!kbuf.buffer) { - ret = -ENOMEM; - goto out; - } - - if (kbuf.buf_align < bss_align) - kbuf.buf_align = bss_align; - - /* Add buffer to segment list */ - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - pi->purgatory_load_addr = kbuf.mem; - /* Load SHF_ALLOC sections */ - buf_addr = kbuf.buffer; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + kbuf.bufsz + bss_pad; + buf_addr = kbuf->buffer; + load_addr = curr_load_addr = kbuf->mem; + bss_addr = load_addr + kbuf->bufsz; for (i = 0; i < pi->ehdr->e_shnum; i++) { + unsigned long align; + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; @@ -871,24 +885,9 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Update entry point based on load address of text section */ if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; - - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; - - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; + kbuf->image->start += sechdrs[entry_sidx].sh_addr; - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. 
- */ - pi->purgatory_buf = kbuf.buffer; - return ret; -out: - vfree(sechdrs); - vfree(kbuf.buffer); - return ret; + return 0; } static int kexec_apply_relocations(struct kimage *image) @@ -958,16 +957,23 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, { struct purgatory_info *pi = &image->purgatory_info; int ret; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; if (kexec_purgatory_size <= 0) return -EINVAL; pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = __kexec_load_purgatory(image, min, max, top_down); + ret = kexec_purgatory_setup_kbuf(pi, &kbuf); if (ret) return ret; + ret = kexec_purgatory_setup_sechdrs(pi, &kbuf); + if (ret) + goto out_free_kbuf; + ret = kexec_apply_relocations(image); if (ret) goto out; @@ -977,7 +983,7 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, out: vfree(pi->sechdrs); pi->sechdrs = NULL; - +out_free_kbuf: vfree(pi->purgatory_buf); pi->purgatory_buf = NULL; return ret; -- cgit v1.2.3 From f1b1cca39650c9c1dbe26140946a518953f66771 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:32 -0700 Subject: kernel/kexec_file.c: remove unneeded for-loop in kexec_purgatory_setup_sechdrs To update the entry point there is an extra loop over all section headers although this can be done in the main loop. So move it there and eliminate the extra loop and variable to store the 'entry section index'. Also, in the main loop, move the usual case, i.e. non-bss section, out of the extra if-block. Link: http://lkml.kernel.org/r/20180321112751.22196-8-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Reviewed-by: Martin Schwidefsky Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 76 +++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 878b97bd3067..6f0ec9f2e5c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -792,7 +792,6 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, unsigned char *buf_addr; unsigned char *src; Elf_Shdr *sechdrs; - int entry_sidx = -1; int i; sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); @@ -824,32 +823,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_offset; } - /* - * Identify entry point section and make entry relative to section - * start. 
- */ - kbuf->image->start = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - kbuf->image->start -= sechdrs[i].sh_addr; - break; - } - } - /* Load SHF_ALLOC sections */ buf_addr = kbuf->buffer; load_addr = curr_load_addr = kbuf->mem; bss_addr = load_addr + kbuf->bufsz; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; @@ -858,34 +836,40 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { + + if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; bss_addr += sechdrs[i].sh_size; + continue; + } + + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *)sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + if (sechdrs[i].sh_flags & SHF_EXECINSTR && + pi->ehdr->e_entry >= sechdrs[i].sh_addr && + pi->ehdr->e_entry < (sechdrs[i].sh_addr + + sechdrs[i].sh_size)) { + kbuf->image->start -= sechdrs[i].sh_addr; + kbuf->image->start += curr_load_addr; } - } - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - kbuf->image->start += sechdrs[entry_sidx].sh_addr; + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } return 0; } -- cgit v1.2.3 From 620f697cc27a6d9b09268f47cd13620488ec67af Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:35 -0700 Subject: kernel/kexec_file.c: remove unneeded variables in kexec_purgatory_setup_sechdrs The main loop currently uses quite a lot of variables to update the section headers. Some of them are unnecessary. So clean them up a little. 
Link: http://lkml.kernel.org/r/20180321112751.22196-9-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6f0ec9f2e5c1..7b63de8a89b6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -785,12 +785,8 @@ out: static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, struct kexec_buf *kbuf) { - unsigned long curr_load_addr; - unsigned long load_addr; unsigned long bss_addr; unsigned long offset; - unsigned char *buf_addr; - unsigned char *src; Elf_Shdr *sechdrs; int i; @@ -823,20 +819,18 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_offset; } - /* Load SHF_ALLOC sections */ - buf_addr = kbuf->buffer; - load_addr = curr_load_addr = kbuf->mem; - bss_addr = load_addr + kbuf->bufsz; + offset = 0; + bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; + void *src, *dst; if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; @@ -844,31 +838,27 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, continue; } - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *)sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - + offset = ALIGN(offset, align); if (sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr + sechdrs[i].sh_size)) { kbuf->image->start -= sechdrs[i].sh_addr; - kbuf->image->start += curr_load_addr; + kbuf->image->start += kbuf->mem + offset; } - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; + src = (void *)sechdrs[i].sh_offset; + dst = pi->purgatory_buf + offset; + memcpy(dst, src, sechdrs[i].sh_size); + + sechdrs[i].sh_addr = kbuf->mem + offset; /* * This section got copied to temporary buffer. Update * ->sh_offset accordingly. */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; + sechdrs[i].sh_offset = (unsigned long)dst; + offset += sechdrs[i].sh_size; } return 0; -- cgit v1.2.3 From 8da0b724959ccd3f8435214ebdaf1aef548967bb Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:39 -0700 Subject: kernel/kexec_file.c: remove mis-use of sh_offset field during purgatory load The current code uses the sh_offset field in purgatory_info->sechdrs to store a pointer to the current load address of the section. Depending on whether the section will be loaded or not this is either a pointer into purgatory_info->purgatory_buf or kexec_purgatory. This is not only a violation of the ELF standard but also makes the code very hard to understand as you cannot tell if the memory you are using is read-only or not. Remove this misuse and store the offset of the section within purgatory_info->purgatory_buf in sh_offset.
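To restate the resulting convention (a summary of the diff below, using the patch's own names):

	/* After this patch, for a loaded SHF_ALLOC section 'sec':
	 *   pi->purgatory_buf + sec->sh_offset  -> writable temporary copy
	 *   sec->sh_addr                        -> final address at execution time
	 * so a symbol is edited via its offset, e.g.:
	 */
	sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;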
Link: http://lkml.kernel.org/r/20180321112751.22196-10-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 7b63de8a89b6..269116fd932c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -790,6 +790,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, Elf_Shdr *sechdrs; int i; + /* + * The section headers in kexec_purgatory are read-only. In order to + * have them modifiable make a temporary copy. + */ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); if (!sechdrs) return -ENOMEM; @@ -797,28 +801,6 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); pi->sechdrs = sechdrs; - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - offset = 0; bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; @@ -847,17 +829,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, kbuf->image->start += kbuf->mem + offset; } - src = (void *)sechdrs[i].sh_offset; + src = (void *)pi->ehdr + sechdrs[i].sh_offset; dst = pi->purgatory_buf + offset; memcpy(dst, src, sechdrs[i].sh_size); - sechdrs[i].sh_addr = kbuf->mem + offset; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)dst; + sechdrs[i].sh_addr = kbuf->mem + offset; + sechdrs[i].sh_offset = offset; offset += sechdrs[i].sh_size; } @@ -1067,7 +1044,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sym_buf = (char *)sec->sh_offset + sym->st_value; + sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); -- cgit v1.2.3 From 3be3f61d25e04ecf90d65d52fad632af5ba8805b Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:43 -0700 Subject: kernel/kexec_file.c: allow archs to set purgatory load address For s390, new kernels are loaded to fixed addresses in memory before they are booted. With the current code this is a problem, as it assumes the kernel will be loaded to an 'arbitrary' address. In particular, kexec_locate_mem_hole searches for a large enough memory region and sets the load address (kexec_buffer->mem) to it. Luckily there is a simple workaround for this problem. By returning 1 in arch_kexec_walk_mem, kexec_locate_mem_hole is turned off.
This allows the architecture to set kbuf->mem by hand. While the trick works fine for the kernel, it does not work for the purgatory, as there the architectures don't have access to its kexec_buffer. Give architectures access to the purgatory's kexec_buffer by changing kexec_load_purgatory to take a pointer to it. With this change, architectures have access to the buffer and can edit it as they need. A nice side effect of this change is that we can get rid of the purgatory_info->purgatory_load_address field, as the information stored there can now be accessed directly from kbuf->mem. Link: http://lkml.kernel.org/r/20180321112751.22196-11-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Reviewed-by: Martin Schwidefsky Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 269116fd932c..75d8e7cf040e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -730,8 +730,8 @@ static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, int i, ret; sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - bss_align = 1; - bss_sz = 0; + kbuf->buf_align = bss_align = 1; + kbuf->bufsz = bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -763,7 +763,6 @@ static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, ret = kexec_add_buffer(kbuf); if (ret) goto out; - pi->purgatory_load_addr = kbuf->mem; return 0; out: @@ -901,27 +900,32 @@ static int kexec_apply_relocations(struct kimage *image) return 0; } -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) +/* + * kexec_load_purgatory - Load and relocate the purgatory object. + * @image: Image to add the purgatory to. + * @kbuf: Memory parameters to use. + * + * Allocates the memory needed for image->purgatory_info.sechdrs and + * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible + * to free the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf) { struct purgatory_info *pi = &image->purgatory_info; int ret; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; if (kexec_purgatory_size <= 0) return -EINVAL; pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = kexec_purgatory_setup_kbuf(pi, &kbuf); + ret = kexec_purgatory_setup_kbuf(pi, kbuf); if (ret) return ret; - ret = kexec_purgatory_setup_sechdrs(pi, &kbuf); + ret = kexec_purgatory_setup_sechdrs(pi, kbuf); if (ret) goto out_free_kbuf; @@ -929,7 +933,6 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (ret) goto out; - *load_addr = pi->purgatory_load_addr; return 0; out: vfree(pi->sechdrs); -- cgit v1.2.3 From e91c2518a5d22a07642f35d85f39001ad379dae4 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 16 Apr 2018 13:36:46 +0200 Subject: livepatch: Initialize shadow variables safely by a custom callback The existing API allows passing sample data to initialize the shadow data. It works well when the data is position independent.
But it fails miserably when we need to set a pointer to the shadow structure itself. Unfortunately, we might need to initialize the pointer surprisingly often because of struct list_head. It is even worse because the list might be hidden in other common structures, for example, struct mutex, struct wait_queue_head. For example, this was needed to fix races in the ALSA sequencer. It required adding a mutex to struct snd_seq_client. See commit b3defb791b26ea06 ("ALSA: seq: Make ioctls race-free") and commit d15d662e89fc667b9 ("ALSA: seq: Fix racy pool initializations"). This patch makes the API safer. A custom constructor function and data are passed to klp_shadow_*alloc() functions instead of the sample data. Note that ctor_data is no longer a template for shadow->data; it may point to any data that is needed when the constructor is called. Also note that the constructor is called under klp_shadow_lock. It is an internal spin_lock that synchronizes alloc() vs. get() operations, see klp_shadow_get_or_alloc(). On one hand, this adds a risk of ABBA deadlocks. On the other hand, it allows some operations to be done safely. For example, we could add the new structure into an existing list. This must be done only once when the structure is allocated. Reported-by: Nicolai Stange Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 82 ++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index fdac27588d60..b10a0bbb7f84 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags, bool warn_on_exist) +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data, + bool warn_on_exist) { struct klp_shadow *new_shadow; void *shadow_data; @@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, if (shadow_data) goto exists; - /* Allocate a new shadow variable for use inside the lock below */ + /* + * Allocate a new shadow variable. Fill it with zeroes by default. + * More complex setting can be done by @ctor function. But it is + * called only when the buffer is really used (under klp_shadow_lock).
+ */ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); if (!new_shadow) return NULL; - new_shadow->obj = obj; - new_shadow->id = id; - - /* Initialize the shadow variable if data provided */ - if (data) - memcpy(new_shadow->data, data, size); - /* Look for again under the lock */ spin_lock_irqsave(&klp_shadow_lock, flags); shadow_data = klp_shadow_get(obj, id); @@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, goto exists; } + new_shadow->obj = obj; + new_shadow->id = id; + + if (ctor) { + int err; + + err = ctor(obj, new_shadow->data, ctor_data); + if (err) { + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n", + obj, id, err); + return NULL; + } + } + /* No found, so attach the newly allocated one */ hash_add_rcu(klp_shadow_hash, &new_shadow->node, (unsigned long)new_shadow->obj); @@ -170,26 +185,32 @@ exists: * klp_shadow_alloc() - allocate and add a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) + * + * Allocates @size bytes for new shadow variable data using @gfp_flags. + * The data are zeroed by default. They are further initialized by @ctor + * function if it is not NULL. The new shadow variable is then added + * to the global hashtable. * - * Allocates @size bytes for new shadow variable data using @gfp_flags - * and copies @size bytes from @data into the new shadow variable's own - * data space. If @data is NULL, @size bytes are still allocated, but - * no copy is performed. The new shadow variable is then added to the - * global hashtable. + * If an existing shadow variable can be found, this routine will + * issue a WARN, exit early and return NULL. * - * If an existing shadow variable can be found, this routine - * will issue a WARN, exit early and return NULL. + * This function guarantees that the constructor function is called only when + * the variable did not exist before. The cost is that @ctor is called + * in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on duplicate or * failure. */ -void *klp_shadow_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, true); } EXPORT_SYMBOL_GPL(klp_shadow_alloc); @@ -197,25 +218,28 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc); * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) * * Returns a pointer to existing shadow data if an shadow * variable is already present. Otherwise, it creates a new shadow * variable like klp_shadow_alloc(). * - * This function guarantees that only one shadow variable exists with - * the given @id for the given @obj. 
It also guarantees that the shadow - variable will be initialized by the given @data only when it did not - exist before. + * This function guarantees that only one shadow variable exists with the given + * @id for the given @obj. It also guarantees that the constructor function + * will be called only when the variable did not exist before. The cost is + * that @ctor is called in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on failure. */ -void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, false); } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); -- cgit v1.2.3 From 3b2c77d000fe9f7d02e9e726e00dccf9f92b256f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 16 Apr 2018 13:36:47 +0200 Subject: livepatch: Allow to call a custom callback when freeing shadow variables We might need to do some actions before the shadow variable is freed. For example, we might need to remove it from a list or free some data that it points to. This is already possible now. The user can get the shadow variable by klp_shadow_get(), do the necessary actions, and then call klp_shadow_free(). This patch allows doing it in a more elegant way. The user could implement the needed actions in a callback that is passed to klp_shadow_free() as a parameter. The callback usually does reverse operations to the constructor callback that can be called by klp_shadow_*alloc(). It is especially useful for klp_shadow_free_all(). There we need to do these extra actions for each found shadow variable with the given ID. Note that the memory used by the shadow variable itself is still released later by an RCU callback. This is needed to protect the internal structures that keep track of all shadow variables. But the destructor is called immediately. The shadow variable must not be accessed after klp_shadow_free() is called anyway. The user is responsible for protecting this in any suitable way. Be aware that the destructor is called under klp_shadow_lock. It is the same as for the constructor in klp_shadow_alloc(). A combined usage sketch for both callbacks follows the diff below. Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index b10a0bbb7f84..83958c814439 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -243,15 +243,26 @@ void *klp_shadow_get_or_alloc(void *obj, unsigned long id, } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); +static void klp_shadow_free_struct(struct klp_shadow *shadow, + klp_shadow_dtor_t dtor) +{ + hash_del_rcu(&shadow->node); + if (dtor) + dtor(shadow->obj, shadow->data); + kfree_rcu(shadow, rcu_head); +} + /** * klp_shadow_free() - detach and free a shadow variable * @obj: pointer to parent object * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for this shadow variable * instance, callers should stop referencing it accordingly.
*/ -void klp_shadow_free(void *obj, unsigned long id) +void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -263,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id) (unsigned long)obj) { if (klp_shadow_match(shadow, obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); + klp_shadow_free_struct(shadow, dtor); break; } } @@ -276,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free); /** * klp_shadow_free_all() - detach and free all <*, id> shadow variables * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for all <*, id> shadow variable * instances, callers should stop referencing them accordingly. */ -void klp_shadow_free_all(unsigned long id) +void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -290,10 +302,8 @@ void klp_shadow_free_all(unsigned long id) /* Delete all <*, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { - if (klp_shadow_match(shadow, shadow->obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); - } + if (klp_shadow_match(shadow, shadow->obj, id)) + klp_shadow_free_struct(shadow, dtor); } spin_unlock_irqrestore(&klp_shadow_lock, flags); -- cgit v1.2.3
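Taken together, the two livepatch patches above move the shadow-variable API to a constructor/destructor model. The following is a hedged usage sketch for a hypothetical livepatch; the klp_shadow_* signatures follow the diffs above, while the shadow type, SHADOW_ID, and the surrounding code are invented for illustration:

	/* Hypothetical shadow data embedding a list_head -- exactly the
	 * position-dependent case the constructor API was introduced for. */
	struct shadow_node {
		struct list_head list;
		int counter;
	};

	static int shadow_node_ctor(void *obj, void *shadow_data, void *ctor_data)
	{
		struct shadow_node *n = shadow_data;

		INIT_LIST_HEAD(&n->list);	/* self-referencing; memcpy() of a template would break it */
		n->counter = 0;
		return 0;			/* non-zero would make the allocation fail */
	}

	static void shadow_node_dtor(void *obj, void *shadow_data)
	{
		struct shadow_node *n = shadow_data;

		list_del(&n->list);		/* reverse of what the constructor set up */
	}

	/* in the livepatched code: */
	struct shadow_node *n;

	n = klp_shadow_get_or_alloc(obj, SHADOW_ID, sizeof(*n), GFP_KERNEL,
				    shadow_node_ctor, NULL);
	/* ... use n ... */
	klp_shadow_free(obj, SHADOW_ID, shadow_node_dtor);

Keep in mind, per the commit messages, that both callbacks run under the internal klp_shadow_lock spinlock, so they must not sleep.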