From 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 Mon Sep 17 00:00:00 2001 From: Jules Maselbas Date: Thu, 29 Mar 2018 15:43:01 +0100 Subject: sched/cpufreq/schedutil: Fix error path mutex unlock This patch prevents the 'global_tunables_lock' mutex from being unlocked before being locked. This mutex is not locked if the sugov_kthread_create() function fails. Signed-off-by: Jules Maselbas Acked-by: Peter Zijlstra Cc: Chris Redpath Cc: Dietmar Eggermann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Patrick Bellasi Cc: Stephen Kyle Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: nd@arm.com Link: http://lkml.kernel.org/r/20180329144301.38419-1-jules.maselbas@arm.com Signed-off-by: Ingo Molnar
--- kernel/sched/cpufreq_schedutil.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7936f548e071..617c6741c525 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -625,10 +625,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch:
-- cgit v1.2.3
From d29a20645d5e929aa7e8616f28e5d8e1c49263ec Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 2 Apr 2018 09:49:54 -0700 Subject: sched/rt: Fix rq->clock_update_flags < RQCF_ACT_SKIP warning While running rt-tests' pi_stress program I got the following splat: rq->clock_update_flags < RQCF_ACT_SKIP WARNING: CPU: 27 PID: 0 at kernel/sched/sched.h:960 assert_clock_updated.isra.38.part.39+0x13/0x20 [...] enqueue_top_rt_rq+0xf4/0x150 ? cpufreq_dbs_governor_start+0x170/0x170 sched_rt_rq_enqueue+0x65/0x80 sched_rt_period_timer+0x156/0x360 ? sched_rt_rq_enqueue+0x80/0x80 __hrtimer_run_queues+0xfa/0x260 hrtimer_interrupt+0xcb/0x220 smp_apic_timer_interrupt+0x62/0x120 apic_timer_interrupt+0xf/0x20 [...] do_idle+0x183/0x1e0 cpu_startup_entry+0x5f/0x70 start_secondary+0x192/0x1d0 secondary_startup_64+0xa5/0xb0 We can get rid of it by the "traditional" means of adding an update_rq_clock() call after acquiring the rq->lock in do_sched_rt_period_timer(). The case for the RT task throttling (which this workload also hits) can be ignored in that the skip_update call is actually bogus and quite the contrary (the request bits are removed/reverted). By setting RQCF_UPDATED we really don't care if the skip is happening or not and will therefore make the assert_clock_updated() check happy.
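The warning above is an ordering assertion over a small set of flag bits. A minimal userspace model (the flag values and the comparison are taken from kernel/sched/sched.h; the rest is illustrative, not the kernel code) shows why calling update_rq_clock() right after taking rq->lock silences the splat:

#include <assert.h>
#include <stdio.h>

#define RQCF_REQ_SKIP  0x01    /* flag values as in kernel/sched/sched.h */
#define RQCF_ACT_SKIP  0x02
#define RQCF_UPDATED   0x04

struct rq { unsigned int clock_update_flags; unsigned long clock; };

static void update_rq_clock(struct rq *rq)
{
        rq->clock++;                            /* stand-in for reading sched_clock() */
        rq->clock_update_flags |= RQCF_UPDATED;
}

static unsigned long rq_clock(struct rq *rq)
{
        /* the WARN_ON the splat came from: the clock was neither
         * updated nor explicitly skipped since the lock was taken */
        assert(rq->clock_update_flags >= RQCF_ACT_SKIP);
        return rq->clock;
}

int main(void)
{
        struct rq rq = { 0, 0 };  /* raw_spin_lock() would clear the flags */

        update_rq_clock(&rq);     /* the added call; without it the assert fires */
        printf("clock=%lu\n", rq_clock(&rq));
        return 0;
}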
Signed-off-by: Davidlohr Bueso Reviewed-by: Matt Fleming Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: dave@stgolabs.net Cc: linux-kernel@vger.kernel.org Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180402164954.16255-1-dave@stgolabs.net Signed-off-by: Ingo Molnar
--- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel')
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 86b77987435e..ad13e6242481 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) continue; raw_spin_lock(&rq->lock); + update_rq_clock(rq); + if (rt_rq->rt_time) { u64 runtime;
-- cgit v1.2.3
From adcc8da8859bee9548bb6d323b1e8de8a7252acd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Apr 2018 09:15:39 -0700 Subject: sched/core: Simplify helpers for rq clock update skip requests By renaming the functions we can get rid of the skip parameter and have better code readability. It makes zero sense to have things such as: rq_clock_skip_update(rq, false) When the skip request is in fact not going to happen. Ever. Rename things such that we end up with: rq_clock_skip_update(rq) rq_clock_cancel_skipupdate(rq) Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: matt@codeblueprint.co.uk Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180404161539.nhadkff2aats74jh@linux-n805 Signed-off-by: Ingo Molnar
--- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 17 ++++++++++++----- 5 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel')
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b68995a417..550a07f648b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -874,7 +874,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d1c7bf7c7e5b..e7b3008b85bb 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0951d1c58d2f..54dc31e7ab9b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7089,7 +7089,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } set_skip_buddy(se);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad13e6242481..7aef6b4e885a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -861,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * 'runtime'.
*/ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq_clock_skip_update(rq, false); + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c3deaee7a7a2..15750c222ca2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -static inline void rq_clock_skip_update(struct rq *rq, bool skip) +static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_held(&rq->lock); - if (skip) - rq->clock_update_flags |= RQCF_REQ_SKIP; - else - rq->clock_update_flags &= ~RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; +} + +/* + * See rt task throttling, which is the only time a skip + * request is cancelled. + */ +static inline void rq_clock_cancel_skipupdate(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + rq->clock_update_flags &= ~RQCF_REQ_SKIP; } struct rq_flags {
-- cgit v1.2.3
From 7303e30ec1d8fb5ca1f07c92d069241c32b2ee1b Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:03 +0200 Subject: syscalls/core: Prepare CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y for compat syscalls It may be useful for an architecture to override the definitions of the COMPAT_SYSCALL_DEFINE0() and __COMPAT_SYSCALL_DEFINEx() macros in <linux/compat.h>, in particular to use a different calling convention for syscalls. This patch provides a mechanism to do so, based on the previously introduced CONFIG_ARCH_HAS_SYSCALL_WRAPPER. If it is enabled, <asm/syscall_wrapper.h> is included in <linux/compat.h> and may be used to define the macros mentioned above. Moreover, as the syscall calling convention may be different if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is set, the compat syscall function prototypes in <linux/compat.h> are #ifndef'd out in that case. As some of the syscalls and/or compat syscalls may not be present, the COND_SYSCALL() and COND_SYSCALL_COMPAT() macros in kernel/sys_ni.c as well as the SYS_NI() and COMPAT_SYS_NI() macros in kernel/time/posix-stubs.c can be re-defined in <asm/syscall_wrapper.h> iff CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-5-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar
--- kernel/sys_ni.c | 10 ++++++++++ kernel/time/posix-stubs.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel')
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6cafc008f6db..9791364925dc 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -5,6 +5,11 @@ #include <asm/unistd.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* we can't #include <linux/syscalls.h> here, but tell gcc to not warn with -Wmissing-prototypes */ asmlinkage long sys_ni_syscall(void); @@ -17,8 +22,13 @@ asmlinkage long sys_ni_syscall(void) return -ENOSYS; } +#ifndef COND_SYSCALL #define COND_SYSCALL(name) cond_syscall(sys_##name) +#endif /* COND_SYSCALL */ + +#ifndef COND_SYSCALL_COMPAT #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) +#endif /* COND_SYSCALL_COMPAT */ /* * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b258bee13b02..69a937c3cd81 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -19,6 +19,11 @@ #include <linux/posix-timers.h> #include <linux/compat.h> +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* Architectures may override SYS_NI and COMPAT_SYS_NI */ +#include <asm/syscall_wrapper.h> +#endif + asmlinkage long sys_ni_posix_timers(void) { pr_err_once("process %d (%s) attempted a POSIX timer syscall " @@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void) return -ENOSYS; } +#ifndef SYS_NI #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#endif + +#ifndef COMPAT_SYS_NI #define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) +#endif SYS_NI(timer_create); SYS_NI(timer_gettime);
-- cgit v1.2.3
From 0211e12dd0a5385ecffd3557bc570dbad7fcf245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Apr 2018 12:40:07 +0200 Subject: genirq/affinity: Don't return with empty affinity masks on error When the allocation of node_to_possible_cpumask fails, then irq_create_affinity_masks() returns with a pointer to the empty affinity masks array, which will cause malfunction. Reorder the allocations so the masks array allocation comes last and every failure path returns NULL. Fixes: 9a0ef98e186d ("genirq/affinity: Assign vectors to all present CPUs") Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Ming Lei
--- kernel/irq/affinity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel')
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a37a3b4b6342..e0665549af59 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -108,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks; + struct cpumask *masks = NULL; cpumask_var_t nmsk, *node_to_possible_cpumask; /* @@ -121,13 +121,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto out; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); if (!node_to_possible_cpumask) - goto out; + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) @@ -192,8 +192,9 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); +outnodemsk: free_node_to_possible_cpumask(node_to_possible_cpumask); -out: +outcpumsk: free_cpumask_var(nmsk); return masks; }
-- cgit v1.2.3
From 47778f33dcba7feb92031643b37e477892f82b62 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:55 +0800 Subject: genirq/affinity: Rename *node_to_possible_cpumask as *node_to_cpumask The following patches will introduce two stage irq spreading for improving irq spread on all possible CPUs. No functional change.
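The reordering in irq_create_affinity_masks() above follows a general pattern: allocate the object you intend to return last, so every failure label frees only helpers and the function can return NULL unconditionally. A hedged userspace sketch of the same goto chain (calloc stands in for the kernel allocators; all names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct cpumask { unsigned long bits; };

static struct cpumask *create_masks(int nvecs)
{
        struct cpumask *masks = NULL;
        void *nmsk, *node_map;

        nmsk = calloc(1, sizeof(long));
        if (!nmsk)
                return NULL;

        node_map = calloc(8, sizeof(long));
        if (!node_map)
                goto out_nmsk;

        /* the object handed back to the caller is allocated last */
        masks = calloc(nvecs, sizeof(*masks));
        if (!masks)
                goto out_node_map;

        /* ... spread vectors using nmsk/node_map ... */

out_node_map:                   /* reached on success and failure alike: */
        free(node_map);         /* the helpers are always temporary      */
out_nmsk:
        free(nmsk);
        return masks;           /* NULL on any failure, never a stale pointer */
}

int main(void)
{
        struct cpumask *m = create_masks(4);

        printf("%s\n", m ? "allocated" : "failed");
        free(m);
        return 0;
}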
Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-2-ming.lei@redhat.com --- kernel/irq/affinity.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e0665549af59..272c968d9ef1 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_possible_cpumask(void) +static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_possible_cpumask(cpumask_var_t *masks) +static void free_node_to_cpumask(cpumask_var_t *masks) { int node; @@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_possible_cpumask(cpumask_var_t *masks) +static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; @@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_possible_cpumask; + cpumask_var_t nmsk, *node_to_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -121,8 +121,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); - if (!node_to_possible_cpumask) + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) goto outcpumsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_possible_cpumask(node_to_possible_cpumask); - nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, + build_node_to_cpumask(node_to_cpumask); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_possible_cpumask[n]); + node_to_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = 
cpumask_weight(nmsk); @@ -193,7 +193,7 @@ done: for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); outnodemsk: - free_node_to_possible_cpumask(node_to_possible_cpumask); + free_node_to_cpumask(node_to_cpumask); outcpumsk: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From b3e6aaa8d94d618e685c4df08bef991a4fb43923 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:56 +0800 Subject: genirq/affinity: Move actual irq vector spreading into a helper function No functional change, just prepare for converting to 2-stage irq vector spreading. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 97 +++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 272c968d9ef1..a9c36904500c 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,50 +94,19 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -/** - * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @nvecs: The total number of vectors - * @affd: Description of the affinity requirements - * - * Returns the masks pointer or NULL if allocation failed. - */ -struct cpumask * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { - int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; + int curvec = affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_cpumask; + int n, nodes, cpus_per_vec, extra_vecs; - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. 
- */ - if (!affv) - return NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto outcpumsk; - - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto outnodemsk; - - /* Fill out vectors at the beginning that don't need affinity */ - for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_cpumask(node_to_cpumask); - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, - &nodemsk); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the @@ -150,7 +119,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (++curvec == last_affv) break; } - goto done; + goto out; } for_each_node_mask(n, nodemsk) { @@ -160,7 +129,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -186,7 +155,51 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) --nodes; } -done: +out: + return curvec - affd->pre_vectors; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +{ + cpumask_var_t nmsk, *node_to_cpumask; + struct cpumask *masks = NULL; + int curvec; + + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (nvecs == affd->pre_vectors + affd->post_vectors) + return NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, + cpu_possible_mask, nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ -- cgit v1.2.3 From 1a2d0914e23aab386f5d5acb689777e24151c2c8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:57 +0800 Subject: genirq/affinity: Allow irq spreading from a given starting point To support two stage irq vector spreading, it's required to add a starting point to the spreading function. No functional change, just preparatory work for the actual two stage change. 
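The arithmetic being factored out is easy to exercise in isolation: the remaining vectors are divided among the remaining nodes, and each node's CPUs are divided among that node's vectors, with the first few vectors absorbing the rounding remainder. A simplified standalone sketch (plain arrays instead of cpumasks, no pre/post vectors; not the kernel helper itself):

#include <stdio.h>

/* Distribute 'numvecs' vectors over NUMA nodes with the given CPU
 * counts, mimicking the vecs_per_node / cpus_per_vec / extra_vecs
 * split the kernel helper performs per node. */
static void spread(int numvecs, const int *node_cpus, int nnodes)
{
        int curvec = 0, nodes = nnodes;

        for (int n = 0; n < nnodes && curvec < numvecs; n++, nodes--) {
                int vecs_per_node = (numvecs - curvec) / nodes;
                int ncpus = node_cpus[n];
                int vecs = vecs_per_node < ncpus ? vecs_per_node : ncpus;
                int cpus_per_vec = ncpus / vecs;
                int extra = ncpus % vecs;   /* rounding-error fixup */

                for (int v = 0; v < vecs && curvec < numvecs; v++, curvec++)
                        printf("vec %d: node %d gets %d cpu(s)\n",
                               curvec, n, cpus_per_vec + (v < extra));
        }
}

int main(void)
{
        int cpus[] = { 6, 2 };  /* CPUs per node on a two-node machine */

        spread(4, cpus, 2);     /* 4 irq vectors to distribute */
        return 0;
}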
[ tglx: Renamed variables, tidied up the code and massaged changelog ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-4-ming.lei@redhat.com --- kernel/irq/affinity.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a9c36904500c..213695a27ddb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,17 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { - int affv = nvecs - affd->pre_vectors - affd->post_vectors; - int last_affv = affv + affd->pre_vectors; - int curvec = affd->pre_vectors; + int n, nodes, cpus_per_vec, extra_vecs, done = 0; + int last_affv = affd->pre_vectors + numvecs; + int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; - int n, nodes, cpus_per_vec, extra_vecs; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); @@ -112,12 +112,13 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ - if (affv <= nodes) { + if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, - node_to_cpumask[n]); - if (++curvec == last_affv) + cpumask_copy(masks + curvec, node_to_cpumask[n]); + if (++done == numvecs) break; + if (++curvec == last_affv) + curvec = affd->pre_vectors; } goto out; } @@ -126,7 +127,7 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); @@ -150,13 +151,16 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= last_affv) + done += v; + if (done >= numvecs) break; + if (curvec >= last_affv) + curvec = affd->pre_vectors; --nodes; } out: - return curvec - affd->pre_vectors; + return done; } /** @@ -169,9 +173,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { + int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; cpumask_var_t nmsk, *node_to_cpumask; struct cpumask *masks = NULL; - int curvec; /* * If there aren't any vectors left after applying the pre/post @@ -198,8 +202,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, - cpu_possible_mask, nmsk, masks); + curvec += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_possible_mask, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity 
*/
-- cgit v1.2.3
From d3056812e7dfe6bf4f8ad9e397a9116dd5d32d15 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:58 +0800 Subject: genirq/affinity: Spread irq vectors among present CPUs as far as possible Commit 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") tried to spread the interrupts across all possible CPUs to make sure that in case of physical hotplug (e.g. virtualization) the CPUs which get plugged in after the device was initialized are targeted by a hardware queue and the corresponding interrupt. This has a downside in cases where the ACPI tables claim that there are more possible CPUs than present CPUs and the number of interrupts to spread out is smaller than the number of possible CPUs. These bogus ACPI tables are unfortunately not uncommon. In such a case the vector spreading algorithm assigns interrupts to CPUs which can never be utilized and as a consequence these interrupts are unused instead of being mapped to present CPUs. As a result the performance of the device is suboptimal. To fix this spread the interrupt vectors in two stages: 1) Spread as many interrupts as possible among the present CPUs 2) Spread the remaining vectors among non present CPUs On an 8 core system, where CPU 0-3 are present and CPU 4-7 are not present, for a device with 4 queues the resulting interrupt affinity is: 1) Before 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") irq 39, cpu list 0 irq 40, cpu list 1 irq 41, cpu list 2 irq 42, cpu list 3 2) With 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") irq 39, cpu list 0-2 irq 40, cpu list 3-4,6 irq 41, cpu list 5 irq 42, cpu list 7 3) With the refined vector spread applied: irq 39, cpu list 0,4 irq 40, cpu list 1,6 irq 41, cpu list 2,5 irq 42, cpu list 3,7 On an 8 core system, where all CPUs are present the resulting interrupt affinity for the 4 queues is: irq 39, cpu list 0,1 irq 40, cpu list 2,3 irq 41, cpu list 4,5 irq 42, cpu list 6,7 This is independent of the number of CPUs which are online at the point of initialization because in such a system the offline CPUs can be easily onlined afterwards, while non-present CPUs need to be plugged physically or virtually, which requires external interaction. The downside of this approach is that in case of physical hotplug the interrupt vector spreading might be suboptimal when CPUs 4-7 are physically plugged. Suboptimal from a NUMA point of view and due to the single target nature of interrupt affinities the later plugged CPUs might not be targeted by interrupts at all. Though, physical hotplug systems are not the common case while the broken ACPI table disease is widespread. So it's preferred to have as many interrupts as possible utilized at the point where the device is initialized. Block multi-queue devices like NVME create a hardware queue per possible CPU, so the goal of commit 84676c1f21 to assign one interrupt vector per possible CPU is still achieved even with physical/virtual hotplug.
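The two stages can be modelled as two calls to one spreading routine, with curvec wrapped back to the first affinity vector when stage one has already consumed them all. A hedged sketch of just that control flow (NUMA is ignored, which is why the resulting pairs differ slightly from the 0,4 / 1,6 / 2,5 / 3,7 lists above):

#include <stdio.h>

#define NVECS 4

/* Assign a set of CPUs to NVECS vectors round-robin, starting at
 * *curvec, so a second pass shares vectors with the first. */
static void stage(const int *cpus, int ncpus, int *curvec,
                  int vecmap[][8], int *veclen)
{
        for (int i = 0; i < ncpus; i++) {
                int v = *curvec;

                vecmap[v][veclen[v]++] = cpus[i];
                *curvec = (v + 1) % NVECS;
        }
}

int main(void)
{
        int present[]    = { 0, 1, 2, 3 };  /* present at init time     */
        int notpresent[] = { 4, 5, 6, 7 };  /* possible but not present */
        int vecmap[NVECS][8], veclen[NVECS] = { 0 }, curvec = 0;

        stage(present, 4, &curvec, vecmap, veclen);     /* stage 1 */
        curvec = 0;      /* stage 1 used every vector: start over, as
                          * the usedvecs >= affvecs check does */
        stage(notpresent, 4, &curvec, vecmap, veclen);  /* stage 2 */

        for (int v = 0; v < NVECS; v++) {
                printf("irq vec %d: cpus", v);
                for (int i = 0; i < veclen[v]; i++)
                        printf(" %d", vecmap[v][i]);
                printf("\n");
        }
        return 0;
}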
[ tglx: Changed from online to present CPUs for the first spreading stage, renamed variables for readability sake, added comments and massaged changelog ] Reported-by: Laurence Oberman Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-5-ming.lei@redhat.com --- kernel/irq/affinity.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 213695a27ddb..f4f29b9d90ee 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -106,6 +106,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; + if (!cpumask_weight(cpu_mask)) + return 0; + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* @@ -173,8 +176,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - cpumask_var_t nmsk, *node_to_cpumask; + int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + int curvec, usedvecs; + cpumask_var_t nmsk, npresmsk, *node_to_cpumask; struct cpumask *masks = NULL; /* @@ -187,9 +191,12 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto outcpumsk; + node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) - goto outcpumsk; + goto outnpresmsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) @@ -202,16 +209,40 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, cpu_possible_mask, - nmsk, masks); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. + */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); + outnodemsk: free_node_to_cpumask(node_to_cpumask); +outnpresmsk: + free_cpumask_var(npresmsk); outcpumsk: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From 621b6d2ea297d0fb6030452c5bcd221f12165fcf Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 9 Apr 2018 19:03:46 +0900 Subject: perf/core: Fix use-after-free in uprobe_perf_close() A use-after-free bug was caught by KASAN while running usdt related code (BCC project. 
bcc/tests/python/test_usdt2.py): ================================================================== BUG: KASAN: use-after-free in uprobe_perf_close+0x222/0x3b0 Read of size 4 at addr ffff880384f9b4a4 by task test_usdt2.py/870 CPU: 4 PID: 870 Comm: test_usdt2.py Tainted: G W 4.16.0-next-20180409 #215 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0xc7/0x15b ? show_regs_print_info+0x5/0x5 ? printk+0x9c/0xc3 ? kmsg_dump_rewind_nolock+0x6e/0x6e ? uprobe_perf_close+0x222/0x3b0 print_address_description+0x83/0x3a0 ? uprobe_perf_close+0x222/0x3b0 kasan_report+0x1dd/0x460 ? uprobe_perf_close+0x222/0x3b0 uprobe_perf_close+0x222/0x3b0 ? probes_open+0x180/0x180 ? free_filters_list+0x290/0x290 trace_uprobe_register+0x1bb/0x500 ? perf_event_attach_bpf_prog+0x310/0x310 ? probe_event_disable+0x4e0/0x4e0 perf_uprobe_destroy+0x63/0xd0 _free_event+0x2bc/0xbd0 ? lockdep_rcu_suspicious+0x100/0x100 ? ring_buffer_attach+0x550/0x550 ? kvm_sched_clock_read+0x1a/0x30 ? perf_event_release_kernel+0x3e4/0xc00 ? __mutex_unlock_slowpath+0x12e/0x540 ? wait_for_completion+0x430/0x430 ? lock_downgrade+0x3c0/0x3c0 ? lock_release+0x980/0x980 ? do_raw_spin_trylock+0x118/0x150 ? do_raw_spin_unlock+0x121/0x210 ? do_raw_spin_trylock+0x150/0x150 perf_event_release_kernel+0x5d4/0xc00 ? put_event+0x30/0x30 ? fsnotify+0xd2d/0xea0 ? sched_clock_cpu+0x18/0x1a0 ? __fsnotify_update_child_dentry_flags.part.0+0x1b0/0x1b0 ? pvclock_clocksource_read+0x152/0x2b0 ? pvclock_read_flags+0x80/0x80 ? kvm_sched_clock_read+0x1a/0x30 ? sched_clock_cpu+0x18/0x1a0 ? pvclock_clocksource_read+0x152/0x2b0 ? locks_remove_file+0xec/0x470 ? pvclock_read_flags+0x80/0x80 ? fcntl_setlk+0x880/0x880 ? ima_file_free+0x8d/0x390 ? lockdep_rcu_suspicious+0x100/0x100 ? ima_file_check+0x110/0x110 ? fsnotify+0xea0/0xea0 ? kvm_sched_clock_read+0x1a/0x30 ? rcu_note_context_switch+0x600/0x600 perf_release+0x21/0x40 __fput+0x264/0x620 ? fput+0xf0/0xf0 ? do_raw_spin_unlock+0x121/0x210 ? do_raw_spin_trylock+0x150/0x150 ? SyS_fchdir+0x100/0x100 ? fsnotify+0xea0/0xea0 task_work_run+0x14b/0x1e0 ? task_work_cancel+0x1c0/0x1c0 ? copy_fd_bitmaps+0x150/0x150 ? vfs_read+0xe5/0x260 exit_to_usermode_loop+0x17b/0x1b0 ? trace_event_raw_event_sys_exit+0x1a0/0x1a0 do_syscall_64+0x3f6/0x490 ? syscall_return_slowpath+0x2c0/0x2c0 ? lockdep_sys_exit+0x1f/0xaa ? syscall_return_slowpath+0x1a3/0x2c0 ? lockdep_sys_exit+0x1f/0xaa ? prepare_exit_to_usermode+0x11c/0x1e0 ? enter_from_user_mode+0x30/0x30 random: crng init done ? 
__put_user_4+0x1c/0x30 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x7f41d95f9340 RSP: 002b:00007fffe71e4268 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 000000000000000d RCX: 00007f41d95f9340 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 000000000000000d RBP: 0000000000000000 R08: 00007f41ca8ff700 R09: 00007f41d996dd1f R10: 00007fffe71e41e0 R11: 0000000000000246 R12: 00007fffe71e4330 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007fffe71e4290 Allocated by task 870: kasan_kmalloc+0xa0/0xd0 kmem_cache_alloc_node+0x11a/0x430 copy_process.part.19+0x11a0/0x41c0 _do_fork+0x1be/0xa20 do_syscall_64+0x198/0x490 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Freed by task 0: __kasan_slab_free+0x12e/0x180 kmem_cache_free+0x102/0x4d0 free_task+0xfe/0x160 __put_task_struct+0x189/0x290 delayed_put_task_struct+0x119/0x250 rcu_process_callbacks+0xa6c/0x1b60 __do_softirq+0x238/0x7ae The buggy address belongs to the object at ffff880384f9b480 which belongs to the cache task_struct of size 12928 It occurs because the task_struct is freed before the perf_event that refers to the task, and the task's flags are checked during teardown of the event. perf_event_alloc() assigns task_struct to hw.target of perf_event, but there is no reference counting for it. As a fix, we take a reference with get_task_struct() in perf_event_alloc() at the above-mentioned assignment and drop it with put_task_struct() in _free_event(). Signed-off-by: Prashant Bhole Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 63b6da39bb38 ("perf: Fix perf_event_exit_task() race") Link: http://lkml.kernel.org/r/20180409100346.6416-1-bhole_prashant_q7@lab.ntt.co.jp Signed-off-by: Ingo Molnar
--- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel')
diff --git a/kernel/events/core.c b/kernel/events/core.c index fc1c330c6bd6..d7af82827373 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4447,6 +4447,9 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->hw.target) + put_task_struct(event->hw.target); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -9955,6 +9958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx.
*/ + get_task_struct(task); event->hw.target = task; } @@ -10070,6 +10074,8 @@ err_ns: perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); + if (event->hw.target) + put_task_struct(event->hw.target); kfree(event); return ERR_PTR(err); -- cgit v1.2.3 From 5da13ab8b0dcaa984c45ae43edf5a4d148603d42 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 9 Apr 2018 21:16:54 +0900 Subject: perf/core: Fix perf_kprobe_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix error handling in perf_kprobe_init(): ================================================================== BUG: KASAN: slab-out-of-bounds in strlen+0x8e/0xa0 lib/string.c:482 Read of size 1 at addr ffff88003f9cc5c0 by task syz-executor2/23095 CPU: 0 PID: 23095 Comm: syz-executor2 Not tainted 4.16.0+ #24 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xca/0x13e lib/dump_stack.c:113 print_address_description+0x6e/0x2c0 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report+0x256/0x380 mm/kasan/report.c:412 strlen+0x8e/0xa0 lib/string.c:482 kstrdup+0x21/0x70 mm/util.c:55 alloc_trace_kprobe+0xc8/0x930 kernel/trace/trace_kprobe.c:325 create_local_trace_kprobe+0x4f/0x3a0 kernel/trace/trace_kprobe.c:1438 perf_kprobe_init+0x149/0x1f0 kernel/trace/trace_event_perf.c:264 perf_kprobe_event_init+0xa8/0x120 kernel/events/core.c:8407 perf_try_init_event+0xcb/0x2a0 kernel/events/core.c:9719 perf_init_event kernel/events/core.c:9750 [inline] perf_event_alloc+0x1367/0x1e20 kernel/events/core.c:10022 SYSC_perf_event_open+0x242/0x2330 kernel/events/core.c:10477 do_syscall_64+0x198/0x640 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Reported-by: 范龙飞 Signed-off-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Thomas Gleixner Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 2c416509b834..94600f1f7efa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -252,6 +252,8 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) ret = strncpy_from_user( func, u64_to_user_ptr(p_event->attr.kprobe_func), KSYM_NAME_LEN); + if (ret == KSYM_NAME_LEN) + ret = -E2BIG; if (ret < 0) goto out; -- cgit v1.2.3 From 0eadcc7a7bc03e991d2da1cf88143fb7cc0342c1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 9 Apr 2018 18:31:30 +0000 Subject: perf/core: Fix perf_uprobe_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to the uprobe PMU fix in perf_kprobe_init(), fix error handling in perf_uprobe_init() as well. 
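Both perf_kprobe_init() and perf_uprobe_init() rely on the same strncpy_from_user() convention: it returns the length of the copied string, or the full buffer size when the source did not fit, in which case nothing guarantees NUL termination. A userspace stand-in (mock_strncpy_from_user() and the KSYM_NAME_LEN value here are illustrative) showing why ret equal to the buffer size must be treated as -E2BIG:

#include <stdio.h>
#include <string.h>

#define KSYM_NAME_LEN 8    /* shrunk for the demo */
#define E2BIG 7            /* Linux errno value */

/* Userspace stand-in for strncpy_from_user(): copies at most 'count'
 * bytes and returns the string length, or 'count' if it did not fit
 * (in which case no NUL was written). */
static long mock_strncpy_from_user(char *dst, const char *src, long count)
{
        long len = strlen(src);

        if (len >= count) {
                memcpy(dst, src, count);
                return count;          /* truncated */
        }
        memcpy(dst, src, len + 1);
        return len;
}

static int fetch_symbol(const char *user_str)
{
        char func[KSYM_NAME_LEN];
        long ret = mock_strncpy_from_user(func, user_str, KSYM_NAME_LEN);

        if (ret == KSYM_NAME_LEN)      /* the fix: truncation is an error */
                return -E2BIG;
        if (ret < 0)
                return (int)ret;
        return 0;
}

int main(void)
{
        printf("%d\n", fetch_symbol("ok"));                 /* 0      */
        printf("%d\n", fetch_symbol("much_too_long_name")); /* -E2BIG */
        return 0;
}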
Reported-by: 范龙飞 Signed-off-by: Song Liu Acked-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Signed-off-by: Ingo Molnar
--- kernel/trace/trace_event_perf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel')
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 94600f1f7efa..c79193e598f5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -302,6 +302,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) return -ENOMEM; ret = strncpy_from_user( path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); + if (ret == PATH_MAX) + return -E2BIG; if (ret < 0) goto out; if (path[0] == '\0') {
-- cgit v1.2.3
From 50268a3d266ecfdd6c5873d62b2758d9732fc598 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Apr 2018 21:20:08 +0900 Subject: tracing/uprobe_event: Fix strncpy corner case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the string fetch function to terminate with NUL. It is OK to drop the rest of the string. Signed-off-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Thomas Gleixner Cc: security@kernel.org Cc: 范龙飞 Fixes: 5baaa59ef09e ("tracing/probes: Implement 'memory' fetch method for uprobes") Signed-off-by: Ingo Molnar
--- kernel/trace/trace_uprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2014f4351ae0..0d450b40988e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,6 +151,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, return; ret = strncpy_from_user(dst, src, maxlen); + if (ret == maxlen) + dst[--ret] = '\0'; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0';
-- cgit v1.2.3
From 0a4d0564f0fc377ad0ba66ce214b9b16324aea4b Mon Sep 17 00:00:00 2001 From: Jérémy Lefaure Date: Sun, 15 Oct 2017 21:22:49 -0400 Subject: tracing: Use ARRAY_SIZE() macro instead of open coding it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useless to re-invent the ARRAY_SIZE macro so let's use it instead of DATA_CNT.
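For reference, the kernel's ARRAY_SIZE() in include/linux/kernel.h is essentially the classic sizeof ratio (plus a __must_be_array() helper so that passing a pointer fails to compile); a simplified version is enough to show what replaces a hand-rolled DATA_CNT:

#include <stdio.h>

/* Simplified ARRAY_SIZE(); the real kernel macro adds
 * __must_be_array() to reject pointers at compile time. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

struct test_filter_data_t { const char *filter; int expect; };

static struct test_filter_data_t test_filter_data[] = {
        { "common_pid == 0", 1 },
        { "common_pid != 0", 0 },
};

int main(void)
{
        /* stands in for the removed DATA_CNT macro */
        printf("%zu entries\n", ARRAY_SIZE(test_filter_data));
        return 0;
}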
Found with Coccinelle with the following semantic patch: @r depends on (org || report)@ type T; T[] E; position p; @@ ( (sizeof(E)@p /sizeof(*E)) | (sizeof(E)@p /sizeof(E[...])) | (sizeof(E)@p /sizeof(T)) ) Link: http://lkml.kernel.org/r/20171016012250.26453-1-jeremy.lefaure@lse.epita.fr Signed-off-by: Jérémy Lefaure [ Removed useless include of kernel.h ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1bda4ec95e18..5eba1cec945c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2140,7 +2140,7 @@ static struct test_filter_data_t { #undef YES #undef NO -#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) +#define DATA_CNT ARRAY_SIZE(test_filter_data) static int test_pred_visited; -- cgit v1.2.3 From f0a2aa5a2a406d0a57aa9b320ffaa5538672b6c5 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Tue, 10 Apr 2018 16:10:30 -0700 Subject: tracing/uprobe: Add support for overlayfs uprobes cannot successfully attach to binaries located in a directory mounted with overlayfs. To verify, create directories for mounting overlayfs (upper,lower,work,merge), move some binary into merge/ and use readelf to obtain some known instruction of the binary. I used /bin/true and the entry instruction(0x13b0): $ mount -t overlay overlay -o lowerdir=lower,upperdir=upper,workdir=work merge $ cd /sys/kernel/debug/tracing $ echo 'p:true_entry PATH_TO_MERGE/merge/true:0x13b0' > uprobe_events $ echo 1 > events/uprobes/true_entry/enable This returns 'bash: echo: write error: Input/output error' and dmesg tells us 'event trace: Could not enable event true_entry' This change makes create_trace_uprobe() look for the real inode of a dentry. In the case of normal filesystems, this simplifies to just returning the inode. In the case of overlayfs(and similar fs) we will obtain the underlying dentry and corresponding inode, upon which uprobes can successfully register. Running the example above with the patch applied, we can see that the uprobe is enabled and will output to trace as expected. Link: http://lkml.kernel.org/r/20180410231030.2720-1-hmclauchlan@fb.com Reviewed-by: Josef Bacik Reviewed-by: Masami Hiramatsu Reviewed-by: Srikar Dronamraju Signed-off-by: Howard McLauchlan Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029ae1be6..8b86d76c55ee 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -446,7 +446,7 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - inode = igrab(d_inode(path.dentry)); + inode = igrab(d_real_inode(path.dentry)); path_put(&path); if (!inode || !S_ISREG(inode->i_mode)) { -- cgit v1.2.3 From 18d45b11d96e6f9b3814960a1394083a3d6b7f74 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Mar 2018 13:57:55 +0530 Subject: trace_uprobe: Use %lx to display offset tu->offset is unsigned long, not a pointer, thus %lx should be used to print it, not the %px. 
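The format chosen by the fix that follows, "0x%0*lx", keeps the zero-padded, pointer-width output while printing a plain unsigned long: the '*' pulls the field width from an int argument computed as two hex digits per pointer byte. A quick userspace check (the 0x13b0 offset reuses the example value from the overlayfs changelog above):

#include <stdio.h>

int main(void)
{
        unsigned long offset = 0x13b0;

        /* width = 2 hex digits per pointer byte: 8 on 32-bit, 16 on 64-bit */
        printf("0x%0*lx\n", (int)(sizeof(void *) * 2), offset);
        return 0;
}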
Link: http://lkml.kernel.org/r/20180315082756.9050-1-ravi.bangoria@linux.vnet.ibm.com Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Fixes: 0e4d819d0893 ("trace_uprobe: Display correct offset in uprobe_events") Suggested-by: Kees Cook Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware)
--- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b86d76c55ee..d7d3c9237f64 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%px", (void *)tu->offset); + seq_printf(m, "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); } else { switch (sizeof(void *)) { case 4:
-- cgit v1.2.3
From a64b2c01e67eff8b8d0d438507fd0346290697cf Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Mar 2018 13:57:56 +0530 Subject: trace_uprobe: Simplify probes_seq_show() Simplify probes_seq_show() function. No change in output before and after patch. Link: http://lkml.kernel.org/r/20180315082756.9050-2-ravi.bangoria@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware)
--- kernel/trace/trace_uprobe.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d7d3c9237f64..21604754bb79 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -602,24 +602,9 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, - trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:", tu->filename); - - /* Don't print "0x (null)" when offset is 0 */ - if (tu->offset) { - seq_printf(m, "0x%0*lx", (int)(sizeof(void *) * 2), tu->offset); - } else { - switch (sizeof(void *)) { - case 4: - seq_printf(m, "0x00000000"); - break; - case 8: - default: - seq_printf(m, "0x0000000000000000"); - break; - } - } + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system, + trace_event_name(&tu->tp.call), tu->filename, + (int)(sizeof(void *) * 2), tu->offset); for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
-- cgit v1.2.3
From 0b3dec05dbbce023f4f25aba975b5d253c313ebb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 11 Apr 2018 10:59:46 -0400 Subject: tracing: Enforce passing in filter=NULL to create_filter() There's some inconsistency in what the output parameter filterp should be set to when passed to create_filter(..., struct event_filter **filterp). Whatever filterp points to should be NULL when calling this function. The create_filter() calls create_filter_start() with a pointer to a local "filter" variable that is set to NULL. The create_filter_start() has a WARN_ON() if the passed in pointer isn't pointing to a value set to NULL. Ideally, create_filter() should pass the filterp variable it received to create_filter_start() instead of hiding it behind a local variable; hiding it allowed create_filter() to fail without updating the passed-in filter, after which the caller of create_filter() tried to free filter, which was never initialized to anything, causing memory corruption.
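The contract being enforced can be shown in miniature: the caller initializes the out-parameter to NULL, and the function builds the object through filterp itself, so even on failure the caller sees whatever was allocated and can free it. A hedged userspace sketch (the struct and error codes are simplified, not the tracing code):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct event_filter { char *filter_string; };

/* *filterp must be NULL on entry (a WARN_ON() in the kernel), and the
 * object is built through filterp, never hidden in a local. */
static int create_filter(const char *str, struct event_filter **filterp)
{
        assert(*filterp == NULL);

        *filterp = calloc(1, sizeof(**filterp));
        if (!*filterp)
                return -1;

        (*filterp)->filter_string = strdup(str);
        if (!(*filterp)->filter_string)
                return -1;  /* caller still sees the object and can free it */
        return 0;
}

int main(void)
{
        struct event_filter *filter = NULL;   /* callers initialize to NULL */
        int err = create_filter("common_pid != 0", &filter);

        if (filter) {                         /* freed on success or failure */
                free(filter->filter_string);
                free(filter);
        }
        return err ? 1 : 0;
}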
Link: http://lkml.kernel.org/r/00000000000032a0c30569916870@google.com Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: syzbot+dadcc936587643d7f568@syzkaller.appspotmail.com Signed-off-by: Steven Rostedt (VMware)
--- kernel/trace/trace_events_filter.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5eba1cec945c..9b4716bb8bb0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1704,18 +1704,16 @@ static int create_filter(struct trace_event_call *call, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; - struct event_filter *filter = NULL; int err; - err = create_filter_start(filter_string, set_str, &pe, &filter); + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; - err = process_preds(call, filter_string, filter, pe); + err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, filter); + append_filter_err(pe, *filterp); - *filterp = filter; return err; } @@ -1739,24 +1737,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir, struct trace_array *tr, char *filter_str, struct event_filter **filterp) { - struct event_filter *filter = NULL; struct filter_parse_error *pe = NULL; int err; - err = create_filter_start(filter_str, true, &pe, &filter); + err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { err = process_system_preds(dir, tr, pe, filter_str); if (!err) { /* System filters just show a default message */ - kfree(filter->filter_string); - filter->filter_string = NULL; + kfree((*filterp)->filter_string); + (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, filter); + append_filter_err(pe, *filterp); } } create_filter_finish(pe); - *filterp = filter; return err; } @@ -1764,7 +1760,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, int apply_event_filter(struct trace_event_file *file, char *filter_string) { struct trace_event_call *call = file->event_call; - struct event_filter *filter; + struct event_filter *filter = NULL; int err; if (!strcmp(strstrip(filter_string), "0")) { @@ -1817,7 +1813,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, { struct event_subsystem *system = dir->subsystem; struct trace_array *tr = dir->tr; - struct event_filter *filter; + struct event_filter *filter = NULL; int err = 0; mutex_lock(&event_mutex); @@ -2024,7 +2020,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { int err; - struct event_filter *filter; + struct event_filter *filter = NULL; struct trace_event_call *call; mutex_lock(&event_mutex);
-- cgit v1.2.3
From 32e6e967fb36bf77ed99221ae3ce1909f045d8f9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 11 Apr 2018 18:02:37 +0000 Subject: perf/core: Need CAP_SYS_ADMIN to create k/uprobe with perf_event_open() Non-root users cannot create kprobes or uprobes through the text-based interface (kprobe_events, uprobe_events), so they should not be able to create probes via perf_event_open() either.
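The fix below is a two-line privilege check at event-init time; a userspace mock of the pattern (capable() is faked here, and only the CAP_SYS_ADMIN value is taken from linux/capability.h) illustrates the intended failure mode for unprivileged callers:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define CAP_SYS_ADMIN 21   /* value from linux/capability.h */

/* Stand-in for the kernel's capable(); the real one checks the
 * current task's effective capability set. */
static bool capable(int cap)
{
        (void)cap;
        return false;      /* pretend we are an unprivileged task */
}

static int perf_probe_event_init(void)
{
        /* the gate both PMUs gain: text-based probe interfaces are
         * root-only, so perf_event_open() must enforce the same bar */
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        return 0;
}

int main(void)
{
        printf("init: %d\n", perf_probe_event_init());  /* -13 (-EACCES) */
        return 0;
}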
Reported-by: Vince Weaver Signed-off-by: Song Liu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU") Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Link: http://lkml.kernel.org/r/C0B2EFB5-C403-4BDB-9046-C14B3EE66999@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7af82827373..2d5fe26551f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8400,6 +8400,10 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ @@ -8437,6 +8441,10 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* * no branch sampling for probe events */ -- cgit v1.2.3 From 60bb83b81169820c691fbfa33a6a4aef32aa4b0b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 13 Apr 2018 15:35:13 -0700 Subject: resource: fix integer overflow at reallocation We've got a bug report indicating a kernel panic at booting on an x86-32 system, and it turned out to be the invalid PCI resource assigned after reallocation. __find_resource() first aligns the resource start address and resets the end address with start+size-1 accordingly, then checks whether it's contained. Here the end address may overflow the integer, although resource_contains() still returns true because the function validates only start and end address. So this ends up with returning an invalid resource (start > end). There was already an attempt to cover such a problem in the commit 47ea91b4052d ("Resource: fix wrong resource window calculation"), but this case is an overseen one. This patch adds the validity check of the newly calculated resource for avoiding the integer overflow problem. Bugzilla: http://bugzilla.opensuse.org/show_bug.cgi?id=1086739 Link: http://lkml.kernel.org/r/s5hpo37d5l8.wl-tiwai@suse.de Fixes: 23c570a67448 ("resource: ability to resize an allocated resource") Signed-off-by: Takashi Iwai Reported-by: Michael Henders Tested-by: Michael Henders Reviewed-by: Andrew Morton Cc: Ram Pai Cc: Bjorn Helgaas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index e270b5048988..2af6c03858b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -651,7 +651,8 @@ static int __find_resource(struct resource *root, struct resource *old, alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { + if (alloc.start <= alloc.end && + resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; -- cgit v1.2.3 From 1cbf29da3628b661379acba7b08a07ef1e5da3b5 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 13 Apr 2018 15:35:34 -0700 Subject: kexec: export PG_swapbacked to VMCOREINFO Since commit 6326fec1122c ("mm: Use owner_priv bit for PageSwapCache, valid when PageSwapBacked"), PG_swapcache is an alias for PG_owner_priv_1, which may be also used for other purposes. 
To know whether the bit indeed has the PG_swapcache meaning, it is necessary to check PG_swapbacked, hence this bit must be exported. Link: http://lkml.kernel.org/r/20180410161345.142e142d@ezekiel.suse.cz Signed-off-by: Petr Tesarik Reviewed-by: Andrew Morton Cc: Dave Young Cc: Xunlei Pang Cc: Baoquan He Cc: Hari Bathini Cc: "Kirill A. Shutemov" Cc: "Marc-Andr Lureau" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a93590cdd9e1..f7674d676889 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -454,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_swapbacked); VMCOREINFO_NUMBER(PG_slab); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); -- cgit v1.2.3 From b799a09f639beeda105fe8a9ab440d80fdabd3b3 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:35:45 -0700 Subject: kexec_file: make use of purgatory optional Patch series "kexec_file, x86, powerpc: refactoring for other architecutres", v2. This is a preparatory patchset for adding kexec_file support on arm64. It was originally included in a arm64 patch set[1], but Philipp is also working on their kexec_file support on s390[2] and some changes are now conflicting. So these common parts were extracted and put into a separate patch set for better integration. What's more, my original patch#4 was split into a few small chunks for easier review after Dave's comment. As such, the resulting code is basically identical with my original, and the only *visible* differences are: - renaming of _kexec_kernel_image_probe() and _kimage_file_post_load_cleanup() - change one of types of arguments at prepare_elf64_headers() Those, unfortunately, require a couple of trivial changes on the rest (#1, #6 to #13) of my arm64 kexec_file patch set[1]. Patch #1 allows making a use of purgatory optional, particularly useful for arm64. Patch #2 commonalizes arch_kexec_kernel_{image_probe, image_load, verify_sig}() and arch_kimage_file_post_load_cleanup() across architectures. Patches #3-#7 are also intended to generalize parse_elf64_headers(), along with exclude_mem_range(), to be made best re-use of. [1] http://lists.infradead.org/pipermail/linux-arm-kernel/2018-February/561182.html [2] http://lkml.iu.edu//hypermail/linux/kernel/1802.1/02596.html This patch (of 7): On arm64, crash dump kernel's usable memory is protected by *unmapping* it from kernel virtual space unlike other architectures where the region is just made read-only. It is highly unlikely that the region is accidentally corrupted and this observation rationalizes that digest check code can also be dropped from purgatory. The resulting code is so simple as it doesn't require a bit ugly re-linking/relocation stuff, i.e. arch_kexec_apply_relocations_add(). Please see: http://lists.infradead.org/pipermail/linux-arm-kernel/2017-December/545428.html All that the purgatory does is to shuffle arguments and jump into a new kernel, while we still need to have some space for a hash value (purgatory_sha256_digest) which is never checked against. As such, it doesn't make sense to have trampline code between old kernel and new kernel on arm64. This patch introduces a new configuration, ARCH_HAS_KEXEC_PURGATORY, and allows related code to be compiled in only if necessary. 
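The early return added below to kexec_calculate_store_digests() uses IS_ENABLED(), which turns a Kconfig symbol into a compile-time 0/1 so the compiler can discard the dead code entirely. A simplified userspace model of the macro chain from include/linux/kconfig.h (the real IS_ENABLED() additionally handles =m via IS_MODULE()):

#include <stdio.h>

/* CONFIG_FOO is defined to 1 when built in; the chain below turns
 * "defined to 1" into a constant 1 and anything else into 0. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

#define CONFIG_ARCH_HAS_KEXEC_PURGATORY 1   /* comment out to flip */

int main(void)
{
        if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) {
                printf("purgatory disabled: digest check skipped\n");
                return 0;
        }
        printf("purgatory enabled: computing digests\n");
        return 0;
}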
[takahiro.akashi@linaro.org: fix trivial screwup] Link: http://lkml.kernel.org/r/20180309093346.GF25863@linaro.org Link: http://lkml.kernel.org/r/20180306102303.9063-2-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e5bcd94c1efb..ab1dced677fd 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -532,6 +532,9 @@ static int kexec_calculate_store_digests(struct kimage *image) struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; + if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) + return 0; + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; @@ -633,6 +636,7 @@ out: return ret; } +#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY /* Actually load purgatory. Lot of code taken from kexec-tools */ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) @@ -1022,3 +1026,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ -- cgit v1.2.3 From 9ec4ecef0af7790551109283ca039a7c52de343c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:35:49 -0700 Subject: kexec_file,x86,powerpc: factor out kexec_file_ops functions As arch_kexec_kernel_image_{probe,load}(), arch_kimage_file_post_load_cleanup() and arch_kexec_kernel_verify_sig() are almost duplicated among architectures, they can be unified with an architecture-defined kexec_file_ops array. So let's factor them out. Link: http://lkml.kernel.org/r/20180306102303.9063-3-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Cc: Michael Ellerman Cc: Thiago Jung Bauermann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ab1dced677fd..332c4fd12cb1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -28,28 +28,80 @@ static int kexec_calculate_store_digests(struct kimage *image); +/* + * Currently this is the only default function that is exported as some + * architectures need it to do additional handling. + * In the future, other default functions may be exported too if required.
+ */ +int kexec_image_probe_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + const struct kexec_file_ops * const *fops; + int ret = -ENOEXEC; + + for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) { + ret = (*fops)->probe(buf, buf_len); + if (!ret) { + image->fops = *fops; + return ret; + } + } + + return ret; +} + /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { - return -ENOEXEC; + return kexec_image_probe_default(image, buf, buf_len); +} + +static void *kexec_image_load_default(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); } void * __weak arch_kexec_kernel_image_load(struct kimage *image) { - return ERR_PTR(-ENOEXEC); + return kexec_image_load_default(image); +} + +static int kexec_image_post_load_cleanup_default(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); } int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { - return -EINVAL; + return kexec_image_post_load_cleanup_default(image); } #ifdef CONFIG_KEXEC_VERIFY_SIG +static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { - return -EKEYREJECTED; + return kexec_image_verify_sig_default(image, buf, buf_len); } #endif -- cgit v1.2.3 From babac4a84a88842bec477a5bdada1460f3bc374c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 13 Apr 2018 15:36:06 -0700 Subject: kexec_file, x86: move re-factored code to generic side In the previous patches, commonly-used routines, exclude_mem_range() and prepare_elf64_headers(), were carved out. Now place them in kexec common code. A prefix "crash_" is given to each of their names to avoid possible name collisions. 
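As an illustration of how an architecture might call the renamed helper (a hypothetical caller, not taken from this series; crash_exclude_mem_range()'s signature is as in the diff below):

	/* Hypothetical: carve the crashkernel window out of a prepared range list. */
	static int exclude_crashkernel(struct crash_mem *cmem)
	{
		/* returns -ENOMEM when a range split no longer fits in cmem */
		return crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
	}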
Link: http://lkml.kernel.org/r/20180306102303.9063-8-takahiro.akashi@linaro.org Signed-off-by: AKASHI Takahiro Acked-by: Dave Young Tested-by: Dave Young Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 332c4fd12cb1..b06b1fac5252 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -22,6 +22,11 @@ #include <linux/ima.h> #include <crypto/hash.h> #include <crypto/sha.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" @@ -1079,3 +1084,173 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } #endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ + +int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happened, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == mem->max_nr_ranges - 1) + return -ENOMEM; + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf; + unsigned int cpu, i; + unsigned long long notes_addr; + unsigned long mstart, mend; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += mem->nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and the other will have __va(physical) addresses.
+ */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + ehdr = (Elf64_Ehdr *)buf; + phdr = (Elf64_Phdr *)(ehdr + 1); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + phdr++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; + (ehdr->e_phnum)++; + phdr++; + + /* Prepare PT_LOAD type program header for kernel text region */ + if (kernel_map) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + ehdr->e_phnum++; + phdr++; + } + + /* Go through all the ranges in mem->ranges[] and prepare phdr */ + for (i = 0; i < mem->nr_ranges; i++) { + mstart = mem->ranges[i].start; + mend = mem->ranges[i].end; + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + phdr++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + *addr = buf; + *sz = elf_sz; + return 0; +} -- cgit v1.2.3 From d2b8178ca7324a21495cb71049b4e4a041ab5942 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:13 -0700 Subject: kernel/kexec_file.c: remove checks in kexec_purgatory_load Before the purgatory is loaded, several checks are done to verify that the ELF file in kexec_purgatory is valid. These checks are incomplete. For example, they don't check for the total size of the sections defined in the section header table or if the entry point actually points into the purgatory. On the other hand the purgatory, although an ELF file on its own, is part of the kernel. Thus not trusting the purgatory means not trusting the kernel build itself. So remove all validity checks on the purgatory and just trust the kernel build.
Link: http://lkml.kernel.org/r/20180321112751.22196-3-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b06b1fac5252..81ba4f782486 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -941,22 +941,8 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - if (kexec_purgatory_size < sizeof(Elf_Ehdr)) - return -ENOEXEC; - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; - if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 - || pi->ehdr->e_type != ET_REL - || !elf_check_arch(pi->ehdr) - || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) - return -ENOEXEC; - - if (pi->ehdr->e_shoff >= kexec_purgatory_size - || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > - kexec_purgatory_size - pi->ehdr->e_shoff)) - return -ENOEXEC; - ret = __kexec_load_purgatory(image, min, max, top_down); if (ret) return ret; -- cgit v1.2.3 From 65c225d3280542f3ea145e052215ce0538f6bb69 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:17 -0700 Subject: kernel/kexec_file.c: make purgatory_info->ehdr const The kexec_purgatory buffer is read-only. Thus all pointers into kexec_purgatory are read-only, too. Point this out by explicitly marking purgatory_info->ehdr as 'const' and update the comments in purgatory_info. Link: http://lkml.kernel.org/r/20180321112751.22196-4-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 81ba4f782486..12cf9c9ff0bc 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -941,7 +941,7 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (kexec_purgatory_size <= 0) return -EINVAL; - pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; ret = __kexec_load_purgatory(image, min, max, top_down); if (ret) @@ -965,9 +965,9 @@ out: static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, const char *name) { + const Elf_Ehdr *ehdr; Elf_Sym *syms; Elf_Shdr *sechdrs; - Elf_Ehdr *ehdr; int i, k; const char *strtab; -- cgit v1.2.3 From 961d921a1b967f76e13f9e11f2b0c2bcb5741f10 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:21 -0700 Subject: kernel/kexec_file.c: search symbols in read-only kexec_purgatory The stripped purgatory does not contain a symtab. So when looking for symbols this is done in read-only kexec_purgatory. Highlight this by marking the corresponding variables as 'const'. 
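The lookups in the patch below all follow the standard in-memory ELF layout. As a hedged reference sketch (field offsets per the ELF specification; kexec_purgatory as in the surrounding code):

	/* Locating the tables inside a read-only, in-memory ELF object: */
	const Elf_Ehdr *ehdr = (const Elf_Ehdr *)kexec_purgatory;
	const Elf_Shdr *sechdrs = (const void *)ehdr + ehdr->e_shoff;	/* section headers */
	/* and, for a section i of type SHT_SYMTAB: */
	const Elf_Sym *syms = (const void *)ehdr + sechdrs[i].sh_offset;	/* symbol table */
	const char *strtab = (const void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;	/* names */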
Link: http://lkml.kernel.org/r/20180321112751.22196-5-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 12cf9c9ff0bc..9bd1ec1dd875 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -962,20 +962,27 @@ out: return ret; } -static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) +/* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. + */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) { + const Elf_Shdr *sechdrs; const Elf_Ehdr *ehdr; - Elf_Sym *syms; - Elf_Shdr *sechdrs; - int i, k; + const Elf_Sym *syms; const char *strtab; + int i, k; - if (!pi->sechdrs || !pi->ehdr) + if (!pi->ehdr) return NULL; - sechdrs = pi->sechdrs; ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; for (i = 0; i < ehdr->e_shnum; i++) { if (sechdrs[i].sh_type != SHT_SYMTAB) @@ -984,8 +991,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, if (sechdrs[i].sh_link >= ehdr->e_shnum) /* Invalid strtab section number */ continue; - strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (Elf_Sym *)sechdrs[i].sh_offset; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; /* Go through symbols for a match */ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { @@ -1013,7 +1020,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; - Elf_Sym *sym; + const Elf_Sym *sym; Elf_Shdr *sechdr; sym = kexec_purgatory_find_symbol(pi, name); @@ -1036,9 +1043,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value) { - Elf_Sym *sym; - Elf_Shdr *sechdrs; struct purgatory_info *pi = &image->purgatory_info; + const Elf_Sym *sym; + Elf_Shdr *sec; char *sym_buf; sym = kexec_purgatory_find_symbol(pi, name); @@ -1051,16 +1058,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sechdrs = pi->sechdrs; + sec = pi->sechdrs + sym->st_shndx; - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sec->sh_type == SHT_NOBITS) { pr_err("symbol %s is in a bss section. Cannot %s\n", name, get_value ? 
"get" : "set"); return -EINVAL; } - sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + - sym->st_value; + sym_buf = (char *)sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); -- cgit v1.2.3 From 8aec395b8478310521031157ef5d44ef19c2c581 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:24 -0700 Subject: kernel/kexec_file.c: use read-only sections in arch_kexec_apply_relocations* When the relocations are applied to the purgatory only the section the relocations are applied to is writable. The other sections, i.e. the symtab and .rel/.rela, are in read-only kexec_purgatory. Highlight this by marking the corresponding variables as 'const'. While at it also change the signatures of arch_kexec_apply_relocations* to take section pointers instead of just the index of the relocation section. This removes the second lookup and sanity check of the sections in arch code. Link: http://lkml.kernel.org/r/20180321112751.22196-6-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 63 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 9bd1ec1dd875..5c70f7f2bae3 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -110,19 +110,35 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, } #endif -/* Apply relocations of type RELA */ +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ int __weak -arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } -/* Apply relocations of type REL */ +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. 
+ */ int __weak -arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, - unsigned int relsec) +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; @@ -879,14 +895,19 @@ static int kexec_apply_relocations(struct kimage *image) { int i, ret; struct purgatory_info *pi = &image->purgatory_info; - Elf_Shdr *sechdrs = pi->sechdrs; + const Elf_Shdr *sechdrs; + + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - /* Apply relocations */ for (i = 0; i < pi->ehdr->e_shnum; i++) { - Elf_Shdr *section, *symtab; + const Elf_Shdr *relsec; + const Elf_Shdr *symtab; + Elf_Shdr *section; + + relsec = sechdrs + i; - if (sechdrs[i].sh_type != SHT_RELA && - sechdrs[i].sh_type != SHT_REL) + if (relsec->sh_type != SHT_RELA && + relsec->sh_type != SHT_REL) continue; /* @@ -895,12 +916,12 @@ static int kexec_apply_relocations(struct kimage *image) * symbol table. And ->sh_info contains section header * index of section to which relocations apply. */ - if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || - sechdrs[i].sh_link >= pi->ehdr->e_shnum) + if (relsec->sh_info >= pi->ehdr->e_shnum || + relsec->sh_link >= pi->ehdr->e_shnum) return -ENOEXEC; - section = &sechdrs[sechdrs[i].sh_info]; - symtab = &sechdrs[sechdrs[i].sh_link]; + section = pi->sechdrs + relsec->sh_info; + symtab = sechdrs + relsec->sh_link; if (!(section->sh_flags & SHF_ALLOC)) continue; @@ -917,12 +938,12 @@ static int kexec_apply_relocations(struct kimage *image) * Respective architecture needs to provide support for applying * relocations of type SHT_RELA/SHT_REL. */ - if (sechdrs[i].sh_type == SHT_RELA) - ret = arch_kexec_apply_relocations_add(pi->ehdr, - sechdrs, i); - else if (sechdrs[i].sh_type == SHT_REL) - ret = arch_kexec_apply_relocations(pi->ehdr, - sechdrs, i); + if (relsec->sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi, section, + relsec, symtab); + else if (relsec->sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi, section, + relsec, symtab); if (ret) return ret; } -- cgit v1.2.3 From 930457057abe4e6d57433dea75e97e0e39fd0ab6 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:28 -0700 Subject: kernel/kexec_file.c: split up __kexec_load_purgatory When inspecting __kexec_load_purgatory you find that it has two tasks: 1) setting up the kexec_buffer for the new kernel and 2) setting up pi->sechdrs for the final load address. The two tasks are independent of each other. To improve readability split up __kexec_load_purgatory into two functions, one for each task, and call them directly from kexec_load_purgatory. Link: http://lkml.kernel.org/r/20180321112751.22196-7-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 200 +++++++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 5c70f7f2bae3..878b97bd3067 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -710,39 +710,97 @@ out: } #ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY -/* Actually load purgatory.
Lot of code taken from kexec-tools */ -static int __kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down) +/* + * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer to setup. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, + struct kexec_buf *kbuf) { - struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, bss_align, bss_sz, bss_pad; - unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; - unsigned char *buf_addr, *src; - int i, ret = 0, entry_sidx = -1; - const Elf_Shdr *sechdrs_c; - Elf_Shdr *sechdrs = NULL; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; + const Elf_Shdr *sechdrs; + unsigned long bss_align; + unsigned long bss_sz; + unsigned long align; + int i, ret; - /* - * sechdrs_c points to section headers in purgatory and are read - * only. No modifications allowed. - */ - sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + bss_align = 1; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (kbuf->buf_align < align) + kbuf->buf_align = align; + kbuf->bufsz = ALIGN(kbuf->bufsz, align); + kbuf->bufsz += sechdrs[i].sh_size; + } else { + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align); + kbuf->memsz = kbuf->bufsz + bss_sz; + if (kbuf->buf_align < bss_align) + kbuf->buf_align = bss_align; + + kbuf->buffer = vzalloc(kbuf->bufsz); + if (!kbuf->buffer) + return -ENOMEM; + pi->purgatory_buf = kbuf->buffer; + + ret = kexec_add_buffer(kbuf); + if (ret) + goto out; + pi->purgatory_load_addr = kbuf->mem; + + return 0; +out: + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + return ret; +} + +/* + * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer. + * @pi: Purgatory to be loaded. + * @kbuf: Buffer prepared to store purgatory. + * + * Allocates the memory needed for the buffer. Caller is responsible to free + * the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, + struct kexec_buf *kbuf) +{ + unsigned long curr_load_addr; + unsigned long load_addr; + unsigned long bss_addr; + unsigned long offset; + unsigned char *buf_addr; + unsigned char *src; + Elf_Shdr *sechdrs; + int entry_sidx = -1; + int i; - /* - * We can not modify sechdrs_c[] and its fields. It is read only. - * Copy it over to a local copy where one can store some temporary - * data and free it at the end. We need to modify ->sh_addr and - * ->sh_offset fields to keep track of permanent and temporary - * locations of sections. - */ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); if (!sechdrs) return -ENOMEM; - - memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, + pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + pi->sechdrs = sechdrs; /* * We seem to have multiple copies of sections. 
First copy is which @@ -770,7 +828,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, * Identify entry point section and make entry relative to section * start. */ - entry = pi->ehdr->e_entry; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; @@ -783,63 +841,19 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > pi->ehdr->e_entry)) { entry_sidx = i; - entry -= sechdrs[i].sh_addr; + kbuf->image->start -= sechdrs[i].sh_addr; break; } } - /* Determine how much memory is needed to load relocatable object. */ - bss_align = 1; - bss_sz = 0; - - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - if (kbuf.buf_align < align) - kbuf.buf_align = align; - kbuf.bufsz = ALIGN(kbuf.bufsz, align); - kbuf.bufsz += sechdrs[i].sh_size; - } else { - /* bss section */ - if (bss_align < align) - bss_align = align; - bss_sz = ALIGN(bss_sz, align); - bss_sz += sechdrs[i].sh_size; - } - } - - /* Determine the bss padding required to align bss properly */ - bss_pad = 0; - if (kbuf.bufsz & (bss_align - 1)) - bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - - kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; - - /* Allocate buffer for purgatory */ - kbuf.buffer = vzalloc(kbuf.bufsz); - if (!kbuf.buffer) { - ret = -ENOMEM; - goto out; - } - - if (kbuf.buf_align < bss_align) - kbuf.buf_align = bss_align; - - /* Add buffer to segment list */ - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - pi->purgatory_load_addr = kbuf.mem; - /* Load SHF_ALLOC sections */ - buf_addr = kbuf.buffer; - load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + kbuf.bufsz + bss_pad; + buf_addr = kbuf->buffer; + load_addr = curr_load_addr = kbuf->mem; + bss_addr = load_addr + kbuf->bufsz; for (i = 0; i < pi->ehdr->e_shnum; i++) { + unsigned long align; + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; @@ -871,24 +885,9 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Update entry point based on load address of text section */ if (entry_sidx >= 0) - entry += sechdrs[entry_sidx].sh_addr; - - /* Make kernel jump to purgatory after shutdown */ - image->start = entry; - - /* Used later to get/set symbol values */ - pi->sechdrs = sechdrs; + kbuf->image->start += sechdrs[entry_sidx].sh_addr; - /* - * Used later to identify which section is purgatory and skip it - * from checksumming. 
- */ - pi->purgatory_buf = kbuf.buffer; - return ret; -out: - vfree(sechdrs); - vfree(kbuf.buffer); - return ret; + return 0; } static int kexec_apply_relocations(struct kimage *image) @@ -958,16 +957,23 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, { struct purgatory_info *pi = &image->purgatory_info; int ret; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; if (kexec_purgatory_size <= 0) return -EINVAL; pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = __kexec_load_purgatory(image, min, max, top_down); + ret = kexec_purgatory_setup_kbuf(pi, &kbuf); if (ret) return ret; + ret = kexec_purgatory_setup_sechdrs(pi, &kbuf); + if (ret) + goto out_free_kbuf; + ret = kexec_apply_relocations(image); if (ret) goto out; @@ -977,7 +983,7 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, out: vfree(pi->sechdrs); pi->sechdrs = NULL; - +out_free_kbuf: vfree(pi->purgatory_buf); pi->purgatory_buf = NULL; return ret; -- cgit v1.2.3 From f1b1cca39650c9c1dbe26140946a518953f66771 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:32 -0700 Subject: kernel/kexec_file.c: remove unneeded for-loop in kexec_purgatory_setup_sechdrs To update the entry point there is an extra loop over all section headers although this can be done in the main loop. So move it there and eliminate the extra loop and variable to store the 'entry section index'. Also, in the main loop, move the usual case, i.e. non-bss section, out of the extra if-block. Link: http://lkml.kernel.org/r/20180321112751.22196-8-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Reviewed-by: Martin Schwidefsky Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 76 +++++++++++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 878b97bd3067..6f0ec9f2e5c1 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -792,7 +792,6 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, unsigned char *buf_addr; unsigned char *src; Elf_Shdr *sechdrs; - int entry_sidx = -1; int i; sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); @@ -824,32 +823,11 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_offset; } - /* - * Identify entry point section and make entry relative to section - * start. 
- */ - kbuf->image->start = pi->ehdr->e_entry; - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - - if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - /* Make entry section relative */ - if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && - ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > - pi->ehdr->e_entry)) { - entry_sidx = i; - kbuf->image->start -= sechdrs[i].sh_addr; - break; - } - } - /* Load SHF_ALLOC sections */ buf_addr = kbuf->buffer; load_addr = curr_load_addr = kbuf->mem; bss_addr = load_addr + kbuf->bufsz; + kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; @@ -858,34 +836,40 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type != SHT_NOBITS) { - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *) sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; - } else { + + if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; bss_addr += sechdrs[i].sh_size; + continue; + } + + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *)sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + if (sechdrs[i].sh_flags & SHF_EXECINSTR && + pi->ehdr->e_entry >= sechdrs[i].sh_addr && + pi->ehdr->e_entry < (sechdrs[i].sh_addr + + sechdrs[i].sh_size)) { + kbuf->image->start -= sechdrs[i].sh_addr; + kbuf->image->start += curr_load_addr; } - } - /* Update entry point based on load address of text section */ - if (entry_sidx >= 0) - kbuf->image->start += sechdrs[entry_sidx].sh_addr; + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } return 0; } -- cgit v1.2.3 From 620f697cc27a6d9b09268f47cd13620488ec67af Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:35 -0700 Subject: kernel/kexec_file.c: remove unneeded variables in kexec_purgatory_setup_sechdrs The main loop currently uses quite a lot of variables to update the section headers. Some of them are unnecessary. So clean them up a little. 
Link: http://lkml.kernel.org/r/20180321112751.22196-9-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6f0ec9f2e5c1..7b63de8a89b6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -785,12 +785,8 @@ out: static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, struct kexec_buf *kbuf) { - unsigned long curr_load_addr; - unsigned long load_addr; unsigned long bss_addr; unsigned long offset; - unsigned char *buf_addr; - unsigned char *src; Elf_Shdr *sechdrs; int i; @@ -823,20 +819,18 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, sechdrs[i].sh_offset; } - /* Load SHF_ALLOC sections */ - buf_addr = kbuf->buffer; - load_addr = curr_load_addr = kbuf->mem; - bss_addr = load_addr + kbuf->bufsz; + offset = 0; + bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; + void *src, *dst; if (!(sechdrs[i].sh_flags & SHF_ALLOC)) continue; align = sechdrs[i].sh_addralign; - if (sechdrs[i].sh_type == SHT_NOBITS) { bss_addr = ALIGN(bss_addr, align); sechdrs[i].sh_addr = bss_addr; @@ -844,31 +838,27 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, continue; } - curr_load_addr = ALIGN(curr_load_addr, align); - offset = curr_load_addr - load_addr; - /* We already modifed ->sh_offset to keep src addr */ - src = (char *)sechdrs[i].sh_offset; - memcpy(buf_addr + offset, src, sechdrs[i].sh_size); - + offset = ALIGN(offset, align); if (sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr + sechdrs[i].sh_size)) { kbuf->image->start -= sechdrs[i].sh_addr; - kbuf->image->start += curr_load_addr; + kbuf->image->start += kbuf->mem + offset; } - /* Store load address and source address of section */ - sechdrs[i].sh_addr = curr_load_addr; + src = (void *)sechdrs[i].sh_offset; + dst = pi->purgatory_buf + offset; + memcpy(dst, src, sechdrs[i].sh_size); + + sechdrs[i].sh_addr = kbuf->mem + offset; /* * This section got copied to temporary buffer. Update * ->sh_offset accordingly. */ - sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); - - /* Advance to the next address */ - curr_load_addr += sechdrs[i].sh_size; + sechdrs[i].sh_offset = (unsigned long)dst; + offset += sechdrs[i].sh_size; } return 0; -- cgit v1.2.3 From 8da0b724959ccd3f8435214ebdaf1aef548967bb Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:39 -0700 Subject: kernel/kexec_file.c: remove mis-use of sh_offset field during purgatory load The current code uses the sh_offset field in purgatory_info->sechdrs to store a pointer to the current load address of the section. Depending on whether the section will be loaded or not this is either a pointer into purgatory_info->purgatory_buf or kexec_purgatory. This is not only a violation of the ELF standard but also makes the code very hard to understand as you cannot tell if the memory you are using is read-only or not. Remove this misuse and store the offset of the section within purgatory_info->purgatory_buf in sh_offset.
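To restate the resulting convention (a summary of the diff below, using the patch's own names):

	/* After this patch, for a loaded SHF_ALLOC section 'sec':
	 *   pi->purgatory_buf + sec->sh_offset  -> writable temporary copy
	 *   sec->sh_addr                        -> final address at execution time
	 * so a symbol is edited via its offset, e.g.:
	 */
	sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;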
Link: http://lkml.kernel.org/r/20180321112751.22196-10-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 7b63de8a89b6..269116fd932c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -790,6 +790,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, Elf_Shdr *sechdrs; int i; + /* + * The section headers in kexec_purgatory are read-only. In order to + * have them modifiable make a temporary copy. + */ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); if (!sechdrs) return -ENOMEM; @@ -797,28 +801,6 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); pi->sechdrs = sechdrs; - /* - * We seem to have multiple copies of sections. First copy is which - * is embedded in kernel in read only section. Some of these sections - * will be copied to a temporary buffer and relocated. And these - * sections will finally be copied to their final destination at - * segment load time. - * - * Use ->sh_offset to reflect section address in memory. It will - * point to original read only copy if section is not allocatable. - * Otherwise it will point to temporary copy which will be relocated. - * - * Use ->sh_addr to contain final address of the section where it - * will go during execution time. - */ - for (i = 0; i < pi->ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type == SHT_NOBITS) - continue; - - sechdrs[i].sh_offset = (unsigned long)pi->ehdr + - sechdrs[i].sh_offset; - } - offset = 0; bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; @@ -847,17 +829,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, kbuf->image->start += kbuf->mem + offset; } - src = (void *)sechdrs[i].sh_offset; + src = (void *)pi->ehdr + sechdrs[i].sh_offset; dst = pi->purgatory_buf + offset; memcpy(dst, src, sechdrs[i].sh_size); - sechdrs[i].sh_addr = kbuf->mem + offset; - - /* - * This section got copied to temporary buffer. Update - * ->sh_offset accordingly. - */ - sechdrs[i].sh_offset = (unsigned long)dst; + sechdrs[i].sh_addr = kbuf->mem + offset; + sechdrs[i].sh_offset = offset; offset += sechdrs[i].sh_size; } @@ -1067,7 +1044,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return -EINVAL; } - sym_buf = (char *)sec->sh_offset + sym->st_value; + sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value; if (get_value) memcpy((void *)buf, sym_buf, size); -- cgit v1.2.3 From 3be3f61d25e04ecf90d65d52fad632af5ba8805b Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 13 Apr 2018 15:36:43 -0700 Subject: kernel/kexec_file.c: allow archs to set purgatory load address For s390, new kernels are loaded to fixed addresses in memory before they are booted. With the current code this is a problem, as it assumes the kernel will be loaded to an 'arbitrary' address. In particular, kexec_locate_mem_hole searches for a large enough memory region and sets the load address (kexec_buffer->mem) to it. Luckily there is a simple workaround for this problem. By returning 1 in arch_kexec_walk_mem, kexec_locate_mem_hole is turned off.
This allows the architecture to set kbuf->mem by hand. While the trick works fine for the kernel, it does not work for the purgatory, as there the architectures don't have access to its kexec_buffer. Give architectures access to the purgatory's kexec_buffer by changing kexec_load_purgatory to take a pointer to it. With this change, architectures have access to the buffer and can edit it as they need. A nice side effect of this change is that we can get rid of the purgatory_info->purgatory_load_address field, as the information stored there can now be accessed directly from kbuf->mem. Link: http://lkml.kernel.org/r/20180321112751.22196-11-prudo@linux.vnet.ibm.com Signed-off-by: Philipp Rudo Reviewed-by: Martin Schwidefsky Acked-by: Dave Young Cc: AKASHI Takahiro Cc: Eric Biederman Cc: Heiko Carstens Cc: Ingo Molnar Cc: Michael Ellerman Cc: Thiago Jung Bauermann Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 269116fd932c..75d8e7cf040e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -730,8 +730,8 @@ static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, int i, ret; sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - bss_align = 1; - bss_sz = 0; + kbuf->buf_align = bss_align = 1; + kbuf->bufsz = bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -763,7 +763,6 @@ static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi, ret = kexec_add_buffer(kbuf); if (ret) goto out; - pi->purgatory_load_addr = kbuf->mem; return 0; out: @@ -901,27 +900,32 @@ static int kexec_apply_relocations(struct kimage *image) return 0; } -/* Load relocatable purgatory object and relocate it appropriately */ -int kexec_load_purgatory(struct kimage *image, unsigned long min, - unsigned long max, int top_down, - unsigned long *load_addr) +/* + * kexec_load_purgatory - Load and relocate the purgatory object. + * @image: Image to add the purgatory to. + * @kbuf: Memory parameters to use. + * + * Allocates the memory needed for image->purgatory_info.sechdrs and + * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible + * to free the memory after use. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf) { struct purgatory_info *pi = &image->purgatory_info; int ret; - struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, - .buf_min = min, .buf_max = max, - .top_down = top_down }; if (kexec_purgatory_size <= 0) return -EINVAL; pi->ehdr = (const Elf_Ehdr *)kexec_purgatory; - ret = kexec_purgatory_setup_kbuf(pi, &kbuf); + ret = kexec_purgatory_setup_kbuf(pi, kbuf); if (ret) return ret; - ret = kexec_purgatory_setup_sechdrs(pi, &kbuf); + ret = kexec_purgatory_setup_sechdrs(pi, kbuf); if (ret) goto out_free_kbuf; @@ -929,7 +933,6 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, if (ret) goto out; - *load_addr = pi->purgatory_load_addr; return 0; out: vfree(pi->sechdrs); -- cgit v1.2.3 From e91c2518a5d22a07642f35d85f39001ad379dae4 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 16 Apr 2018 13:36:46 +0200 Subject: livepatch: Initialize shadow variables safely by a custom callback The existing API allows passing sample data to initialize the shadow data. It works well when the data is position independent.
But it fails miserably when we need to set a pointer to the shadow structure itself. Unfortunately, we might need to initialize the pointer surprisingly often because of struct list_head. It is even worse because the list might be hidden in other common structures, for example, struct mutex, struct wait_queue_head. For example, this was needed to fix races in the ALSA sequencer. It required adding a mutex to struct snd_seq_client. See commit b3defb791b26ea06 ("ALSA: seq: Make ioctls race-free") and commit d15d662e89fc667b9 ("ALSA: seq: Fix racy pool initializations"). This patch makes the API safer. A custom constructor function and data are passed to klp_shadow_*alloc() functions instead of the sample data. Note that ctor_data is no longer a template for shadow->data; it may point to any data that is needed when the constructor is called. Also note that the constructor is called under klp_shadow_lock. It is an internal spin_lock that synchronizes alloc() vs. get() operations, see klp_shadow_get_or_alloc(). On one hand, this adds a risk of ABBA deadlocks. On the other hand, it allows some operations to be done safely. For example, we could add the new structure into an existing list. This must be done only once when the structure is allocated. Reported-by: Nicolai Stange Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 82 ++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index fdac27588d60..b10a0bbb7f84 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags, bool warn_on_exist) +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data, + bool warn_on_exist) { struct klp_shadow *new_shadow; void *shadow_data; @@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, if (shadow_data) goto exists; - /* Allocate a new shadow variable for use inside the lock below */ + /* + * Allocate a new shadow variable. Fill it with zeroes by default. + * More complex setting can be done by @ctor function. But it is + * called only when the buffer is really used (under klp_shadow_lock).
+ */ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); if (!new_shadow) return NULL; - new_shadow->obj = obj; - new_shadow->id = id; - - /* Initialize the shadow variable if data provided */ - if (data) - memcpy(new_shadow->data, data, size); - /* Look for again under the lock */ spin_lock_irqsave(&klp_shadow_lock, flags); shadow_data = klp_shadow_get(obj, id); @@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, goto exists; } + new_shadow->obj = obj; + new_shadow->id = id; + + if (ctor) { + int err; + + err = ctor(obj, new_shadow->data, ctor_data); + if (err) { + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n", + obj, id, err); + return NULL; + } + } + /* No found, so attach the newly allocated one */ hash_add_rcu(klp_shadow_hash, &new_shadow->node, (unsigned long)new_shadow->obj); @@ -170,26 +185,32 @@ exists: * klp_shadow_alloc() - allocate and add a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) + * + * Allocates @size bytes for new shadow variable data using @gfp_flags. + * The data are zeroed by default. They are further initialized by @ctor + * function if it is not NULL. The new shadow variable is then added + * to the global hashtable. * - * Allocates @size bytes for new shadow variable data using @gfp_flags - * and copies @size bytes from @data into the new shadow variable's own - * data space. If @data is NULL, @size bytes are still allocated, but - * no copy is performed. The new shadow variable is then added to the - * global hashtable. + * If an existing shadow variable can be found, this routine will + * issue a WARN, exit early and return NULL. * - * If an existing shadow variable can be found, this routine - * will issue a WARN, exit early and return NULL. + * This function guarantees that the constructor function is called only when + * the variable did not exist before. The cost is that @ctor is called + * in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on duplicate or * failure. */ -void *klp_shadow_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, true); } EXPORT_SYMBOL_GPL(klp_shadow_alloc); @@ -197,25 +218,28 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc); * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) * * Returns a pointer to existing shadow data if an shadow * variable is already present. Otherwise, it creates a new shadow * variable like klp_shadow_alloc(). * - * This function guarantees that only one shadow variable exists with - * the given @id for the given @obj. 
It also guarantees that the shadow - variable will be initialized by the given @data only when it did not - exist before. + * This function guarantees that only one shadow variable exists with the given + * @id for the given @obj. It also guarantees that the constructor function + * will be called only when the variable did not exist before. The cost is + * that @ctor is called in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on failure. */ -void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, false); } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); -- cgit v1.2.3 From 3b2c77d000fe9f7d02e9e726e00dccf9f92b256f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 16 Apr 2018 13:36:47 +0200 Subject: livepatch: Allow to call a custom callback when freeing shadow variables We might need to do some actions before the shadow variable is freed. For example, we might need to remove it from a list or free some data that it points to. This is already possible now. The user can get the shadow variable by klp_shadow_get(), do the necessary actions, and then call klp_shadow_free(). This patch allows doing it in a more elegant way. The user could implement the needed actions in a callback that is passed to klp_shadow_free() as a parameter. The callback usually does reverse operations to the constructor callback that can be called by klp_shadow_*alloc(). It is especially useful for klp_shadow_free_all(). There we need to do these extra actions for each found shadow variable with the given ID. Note that the memory used by the shadow variable itself is still released later by an RCU callback. This is needed to protect the internal structures that keep track of all shadow variables. But the destructor is called immediately. The shadow variable must not be accessed after klp_shadow_free() is called anyway. The user is responsible for protecting this in any suitable way. Be aware that the destructor is called under klp_shadow_lock. It is the same as for the constructor in klp_shadow_alloc(). A combined usage sketch for both callbacks follows the diff below. Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index b10a0bbb7f84..83958c814439 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -243,15 +243,26 @@ void *klp_shadow_get_or_alloc(void *obj, unsigned long id, } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); +static void klp_shadow_free_struct(struct klp_shadow *shadow, + klp_shadow_dtor_t dtor) +{ + hash_del_rcu(&shadow->node); + if (dtor) + dtor(shadow->obj, shadow->data); + kfree_rcu(shadow, rcu_head); +} + /** * klp_shadow_free() - detach and free a shadow variable * @obj: pointer to parent object * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for this shadow variable * instance, callers should stop referencing it accordingly.
*/ -void klp_shadow_free(void *obj, unsigned long id) +void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -263,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id) (unsigned long)obj) { if (klp_shadow_match(shadow, obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); + klp_shadow_free_struct(shadow, dtor); break; } } @@ -276,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free); /** * klp_shadow_free_all() - detach and free all <*, id> shadow variables * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for all <*, id> shadow variable * instances, callers should stop referencing them accordingly. */ -void klp_shadow_free_all(unsigned long id) +void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -290,10 +302,8 @@ void klp_shadow_free_all(unsigned long id) /* Delete all <*, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { - if (klp_shadow_match(shadow, shadow->obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); - } + if (klp_shadow_match(shadow, shadow->obj, id)) + klp_shadow_free_struct(shadow, dtor); } spin_unlock_irqrestore(&klp_shadow_lock, flags); -- cgit v1.2.3
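Taken together, the two livepatch patches above move the shadow-variable API to a constructor/destructor model. The following is a hedged usage sketch for a hypothetical livepatch; the klp_shadow_* signatures follow the diffs above, while the shadow type, SHADOW_ID, and the surrounding code are invented for illustration:

	/* Hypothetical shadow data embedding a list_head -- exactly the
	 * position-dependent case the constructor API was introduced for. */
	struct shadow_node {
		struct list_head list;
		int counter;
	};

	static int shadow_node_ctor(void *obj, void *shadow_data, void *ctor_data)
	{
		struct shadow_node *n = shadow_data;

		INIT_LIST_HEAD(&n->list);	/* self-referencing; memcpy() of a template would break it */
		n->counter = 0;
		return 0;			/* non-zero would make the allocation fail */
	}

	static void shadow_node_dtor(void *obj, void *shadow_data)
	{
		struct shadow_node *n = shadow_data;

		list_del(&n->list);		/* reverse of what the constructor set up */
	}

	/* in the livepatched code: */
	struct shadow_node *n;

	n = klp_shadow_get_or_alloc(obj, SHADOW_ID, sizeof(*n), GFP_KERNEL,
				    shadow_node_ctor, NULL);
	/* ... use n ... */
	klp_shadow_free(obj, SHADOW_ID, shadow_node_dtor);

Keep in mind, per the commit messages, that both callbacks run under the internal klp_shadow_lock spinlock, so they must not sleep.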