From 49ca6153208f6efc409c1deb82dd5bcbb519d7e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Aug 2021 09:39:31 +0200 Subject: bpf: Relicense disassembler as GPL-2.0-only OR BSD-2-Clause Some time ago we dual-licensed both libbpf and bpftool through commits 1bc38b8ff6cc ("libbpf: relicense libbpf as LGPL-2.1 OR BSD-2-Clause") and 907b22365115 ("tools: bpftool: dual license all files"). The latter missed the disasm.{c,h} which we pull in via kernel/bpf/ such that we have a single source for verifier as well as bpftool asm dumping, see also f4ac7e0b5cc8 ("bpf: move instruction printing into a separate file"). It is currently GPL-2.0-only and missed the conversion in 907b22365115, therefore relicense the two as GPL-2.0-only OR BSD-2-Clause as well. Spotted-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Brendan Jackman Acked-by: Jakub Kicinski Acked-by: Jiri Olsa Acked-by: Simon Horman Acked-by: Martin KaFai Lau Acked-by: Xu Kuohai Acked-by: Edward Cree --- kernel/bpf/disasm.c | 2 +- kernel/bpf/disasm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ca3cd9aaa6ce..7b4afb7d96db 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e546b18d27da..a4b040793f44 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ -- cgit v1.2.3 From 15eb7c888e749fbd1cc0370f3d38de08ad903700 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 31 Aug 2021 08:38:19 +0200 Subject: locking/rwsem: Add missing __init_rwsem() for PREEMPT_RT 730633f0b7f95 became the first direct caller of __init_rwsem() vs the usual init_rwsem(), exposing PREEMPT_RT's lack thereof. Add it. [ tglx: Move it out of line ] Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/50a936b7d8f12277d6ec7ed2ef0421a381056909.camel@gmx.de --- kernel/locking/rwsem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9215b4d6a9de..000e8d5a2884 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #include "rwbase_rt.c" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __rwsem_init(struct rw_semaphore *sem, const char *name, +void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key) { + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); -} -EXPORT_SYMBOL(__rwsem_init); #endif +} +EXPORT_SYMBOL(__init_rwsem); static inline void __down_read(struct rw_semaphore *sem) { -- cgit v1.2.3 From a974b54036f79dd5e395e9f6c80c3decb4661a14 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 Aug 2021 14:18:40 +0100 Subject: futex: Return error code instead of assigning it without effect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check on the rt_waiter and top_waiter->pi_state is assigning an error return code to ret but this later gets re-assigned, hence the check is ineffective. Return -EINVAL rather than assigning it to ret which was the original intent. Fixes: dc7109aaa233 ("futex: Validate waiter correctly in futex_proxy_trylock_atomic()") Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Reviewed-by: AndrĂ© Almeida Link: https://lore.kernel.org/r/20210818131840.34262-1-colin.king@canonical.com --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e7b4c6121da4..30e7daebaec8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2025,7 +2025,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * and waiting on the 'waitqueue' futex which is always !PI. */ if (!top_waiter->rt_waiter || top_waiter->pi_state) - ret = -EINVAL; + return -EINVAL; /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) -- cgit v1.2.3 From 4f07ec0d76f242d4ca0f0c0c6f7293c28254a554 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:48 +0200 Subject: futex: Prevent inconsistent state and exit race The recent rework of the requeue PI code introduced a possibility for going back to user space in inconsistent state: CPU 0 CPU 1 requeue_futex() if (lock_pifutex_user()) { dequeue_waiter(); wake_waiter(task); sched_in(task); return_from_futex_syscall(); ---> Inconsistent state because PI state is not established It becomes worse if the woken up task immediately exits: sys_exit(); attach_pistate(vpid); <--- FAIL Attach the pi state before dequeuing and waking the waiter. If the waiter gets a spurious wakeup before the dequeue operation it will wait in futex_requeue_pi_wakeup_sync() and therefore cannot return and exit. Fixes: 07d91ef510fb ("futex: Prevent requeue_pi() lock nesting issue on RT") Reported-by: syzbot+4d1bd0725ef09168e1a0@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.558914045@linutronix.de --- kernel/futex.c | 98 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 30e7daebaec8..04164d440d98 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, newval |= FUTEX_WAITERS; ret = lock_pi_update_atomic(uaddr, uval, newval); - /* If the take over worked, return 1 */ - return ret < 0 ? ret : 1; + if (ret) + return ret; + + /* + * If the waiter bit was requested the caller also needs PI + * state attached to the new owner of the user space futex. + * + * @task is guaranteed to be alive and it cannot be exiting + * because it is either sleeping or waiting in + * futex_requeue_pi_wakeup_sync(). + */ + if (set_waiters) { + ret = attach_to_pi_owner(uaddr, newval, key, ps, + exiting); + WARN_ON(ret); + } + return 1; } /* @@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EAGAIN; /* - * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in - * the contended case or if set_waiters is 1. The pi_state is returned - * in ps in contended cases. + * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit + * in the contended case or if @set_waiters is true. + * + * In the contended case PI state is attached to the lock owner. If + * the user space lock can be acquired then PI state is attached to + * the new owner (@top_waiter->task) when @set_waiters is true. */ vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { - /* Dequeue, wake up and update top_waiter::requeue_state */ + /* + * Lock was acquired in user space and PI state was + * attached to @top_waiter->task. That means state is fully + * consistent and the waiter can return to user space + * immediately after the wakeup. + */ requeue_pi_wake_futex(top_waiter, key2, hb2); - return vpid; } else if (ret < 0) { /* Rewind top_waiter::requeue_state */ futex_requeue_pi_complete(top_waiter, ret); @@ -2208,19 +2230,26 @@ retry_private: &exiting, nr_requeue); /* - * At this point the top_waiter has either taken uaddr2 or is - * waiting on it. If the former, then the pi_state will not - * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, @ret contains the - * VPID of the top waiter task. - * If the lock was not taken, we have pi_state and an initial - * refcount on it. In case of an error we have nothing. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. * * The top waiter's requeue_state is up to date: * - * - If the lock was acquired atomically (ret > 0), then + * - If the lock was acquired atomically (ret == 1), then * the state is Q_REQUEUE_PI_LOCKED. * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * * - If the trylock failed with an error (ret < 0) then * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing * happened", or Q_REQUEUE_PI_IGNORE when there was an @@ -2234,36 +2263,20 @@ retry_private: * the same sanity checks for requeue_pi as the loop * below does. */ - if (ret > 0) { - WARN_ON(pi_state); - task_count++; - /* - * If futex_proxy_trylock_atomic() acquired the - * user space futex, then the user space value - * @uaddr2 has been set to the @hb1's top waiter - * task VPID. This task is guaranteed to be alive - * and cannot be exiting because it is either - * sleeping or blocked on @hb2 lock. - * - * The @uaddr2 futex cannot have waiters either as - * otherwise futex_proxy_trylock_atomic() would not - * have succeeded. - * - * In order to requeue waiters to @hb2, pi state is - * required. Hand in the VPID value (@ret) and - * allocate PI state with an initial refcount on - * it. - */ - ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, - &exiting); - WARN_ON(ret); - } - switch (ret) { case 0: /* We hold a reference on the pi state. */ break; + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + /* * If the above failed, then pi_state is NULL and * waiter::requeue_state is correct. @@ -2395,9 +2408,8 @@ retry_private: } /* - * We took an extra initial reference to the pi_state either in - * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need - * to drop it here again. + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. */ put_pi_state(pi_state); -- cgit v1.2.3 From 249955e51c8136189b3c66f54e212981a1350a0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:50 +0200 Subject: futex: Clarify comment for requeue_pi_wake_futex() It's slightly confusing. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.618613025@linutronix.de --- kernel/futex.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 04164d440d98..82cd270f25a0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1954,12 +1954,26 @@ static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. Set the futex_q key - * to the requeue target futex so the waiter can detect the wakeup on the right - * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with both q->lock_ptr and hb->lock held. + * target futex if it is uncontended or via a lock steal. + * + * 1) Set @q::key to the requeue target futex key so the waiter can detect + * the wakeup on the right futex. + * + * 2) Dequeue @q from the hash bucket. + * + * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock + * acquisition. + * + * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that + * the waiter has to fixup the pi state. + * + * 5) Complete the requeue state so the waiter can make progress. After + * this point the waiter task can return from the syscall immediately in + * case that the pi state does not have to be fixed up. + * + * 6) Wake the waiter task. + * + * Must be called with both q->lock_ptr and hb->lock held. */ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, -- cgit v1.2.3 From 340576590dac4bb58d532a8ad5bfa806d8ab473c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:51 +0200 Subject: futex: Avoid redundant task lookup No need to do the full VPID based task lookup and validation of the top waiter when the user space futex was acquired on it's behalf during the requeue_pi operation. The task is known already and it cannot go away before requeue_pi_wake_futex() has been invoked. Split out the actual attach code from attach_pi_state_owner() and use that instead of the full blown variant. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.676104881@linutronix.de --- kernel/futex.c | 67 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 82cd270f25a0..a316dce74c3d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1263,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, return -ESRCH; } +static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, + struct futex_pi_state **ps) +{ + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. + */ + struct futex_pi_state *pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ + pi_state->owner = p; + + *ps = pi_state; +} /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. @@ -1272,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; - struct futex_pi_state *pi_state; struct task_struct *p; /* @@ -1324,36 +1353,11 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return ret; } - /* - * No existing pi state. First waiter. [2] - * - * This creates pi_state, we have hb->lock held, this means nothing can - * observe this state, wait_lock is irrelevant. - */ - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make @p - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - /* - * Assignment without holding pi_state->pi_mutex.wait_lock is safe - * because there is no concurrency as the object is not published yet. - */ - pi_state->owner = p; + __attach_to_pi_owner(p, key, ps); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); - *ps = pi_state; - return 0; } @@ -1464,11 +1468,14 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * @task is guaranteed to be alive and it cannot be exiting * because it is either sleeping or waiting in * futex_requeue_pi_wakeup_sync(). + * + * No need to do the full attach_to_pi_owner() exercise + * because @task is known and valid. */ if (set_waiters) { - ret = attach_to_pi_owner(uaddr, newval, key, ps, - exiting); - WARN_ON(ret); + raw_spin_lock_irq(&task->pi_lock); + __attach_to_pi_owner(task, key, ps); + raw_spin_unlock_irq(&task->pi_lock); } return 1; } -- cgit v1.2.3 From d66e3edee7af87fe212df611ab9846b987a5070f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Sep 2021 22:47:06 +0200 Subject: futex: Remove unused variable 'vpid' in futex_proxy_trylock_atomic() The recent bug fix left the variable 'vpid' and an assignment to it around, but the variable is otherwise unused. clang dose not complain even with W=1, but gcc exposed this. Fixes: 4f07ec0d76f2 ("futex: Prevent inconsistent state and exit race") Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a316dce74c3d..c15ad276fd15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2034,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, { struct futex_q *top_waiter = NULL; u32 curval; - int ret, vpid; + int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -2079,7 +2079,6 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * the user space lock can be acquired then PI state is attached to * the new owner (@top_waiter->task) when @set_waiters is true. */ - vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { -- cgit v1.2.3 From 54357f0c9149c871e5e4b83ad385a6f2ad3a749f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Aug 2021 15:26:25 +0200 Subject: tracing: Add migrate-disabled counter to tracing output. migrate_disable() forbids task migration to another CPU. It is available since v5.11 and has already users such as highmem or BPF. It is useful to observe this task state in tracing which already has other states like the preemption counter. Instead of adding the migrate disable counter as a new entry to struct trace_entry, which would extend the whole struct by four bytes, it is squashed into the preempt-disable counter. The lower four bits represent the preemption counter, the upper four bits represent the migrate disable counter. Both counter shouldn't exceed 15 but if they do, there is a safety net which caps the value at 15. Add the migrate-disable counter to the trace entry so it shows up in the trace. Due to the users mentioned above, it is already possible to observe it: | bash-1108 [000] ...21 73.950578: rss_stat: mm_id=2213312838 curr=0 type=MM_ANONPAGES size=8192B | bash-1108 [000] d..31 73.951222: irq_disable: caller=flush_tlb_mm_range+0x115/0x130 parent=ptep_clear_flush+0x42/0x50 | bash-1108 [000] d..31 73.951222: tlb_flush: pages:1 reason:local mm shootdown (3) The last value is the migrate-disable counter. Things that popped up: - trace_print_lat_context() does not print the migrate counter. Not sure if it should. It is used in "verbose" mode and uses 8 digits and I'm not sure ther is something processing the value. - trace_define_common_fields() now defines a different variable. This probably breaks things. No ide what to do in order to preserve the old behaviour. Since this is used as a filter it should be split somehow to be able to match both nibbles here. Link: https://lkml.kernel.org/r/20210810132625.ylssabmsrkygokuv@linutronix.de Signed-off-by: Thomas Gleixner [bigeasy: patch description.] Signed-off-by: Sebastian Andrzej Siewior [ SDR: Removed change to common_preempt_count field name ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 26 +++++++++++++++++++------- kernel/trace/trace_events.c | 1 + kernel/trace/trace_output.c | 11 ++++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 489924cde4f8..b554e18924f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2603,6 +2603,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s) } EXPORT_SYMBOL_GPL(trace_handle_return); +static unsigned short migration_disable_value(void) +{ +#if defined(CONFIG_SMP) + return current->migration_disabled; +#else + return 0; +#endif +} + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { unsigned int trace_flags = irqs_status; @@ -2621,7 +2630,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (pc & 0xff); + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } struct ring_buffer_event * @@ -4189,9 +4199,10 @@ static void print_lat_help_header(struct seq_file *m) "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + "# |||| / _-=> migrate-disable \n" + "# ||||| / delay \n" + "# cmd pid |||||| time | caller \n" + "# \\ / |||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -4229,9 +4240,10 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); } void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1349b6de5eeb..830b3b9940f4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -181,6 +181,7 @@ static int trace_define_common_fields(void) __common_field(unsigned short, type); __common_field(unsigned char, flags); + /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a0bf446bb034..c2ca40e8595b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -492,8 +492,13 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); + if (entry->preempt_count & 0xf) + trace_seq_printf(s, "%x", entry->preempt_count & 0xf); + else + trace_seq_putc(s, '.'); + + if (entry->preempt_count & 0xf0) + trace_seq_printf(s, "%x", entry->preempt_count >> 4); else trace_seq_putc(s, '.'); @@ -656,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_seq_printf( s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx); + entry->preempt_count & 0xf, iter->idx); } else { lat_print_generic(s, entry, iter->cpu); } -- cgit v1.2.3 From 5615e088b43d564aec08ccfaecb21ba484a1b4d6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2021 14:51:50 +0300 Subject: tracing: Fix some alloc_event_probe() error handling bugs There are two bugs in this code. First, if the kzalloc() fails it leads to a NULL dereference of "ep" on the next line. Second, if the alloc_event_probe() function returns an error then it leads to an error pointer dereference in the caller. Link: https://lkml.kernel.org/r/20210824115150.GI31143@kili Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 56a96e9750cf..3044b762cbd7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -151,7 +151,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group, ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); if (!ep) { - trace_event_put_ref(ep->event); + trace_event_put_ref(event); goto error; } ep->event = event; @@ -851,7 +851,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) ret = PTR_ERR(ep); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto error; /* We know ep is not allocated */ + ep = NULL; + goto error; } argc -= 2; argv += 2; -- cgit v1.2.3 From 3c91dda97eea704ac257ddb138d1154adab8db62 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 7 Sep 2021 19:58:18 -0700 Subject: kernel/acct.c: use dedicated helper to access rlimit values Use rlimit() helper instead of manually writing whole chain from task to rlimit value. See patch "posix-cpu-timers: Use dedicated helper to access rlimit values". Link: https://lkml.kernel.org/r/20210728030822.524789-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Randy Dunlap Cc: sh_def@163.com Cc: Yang Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a64102be2bb0..23a7ab8e6cbc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -478,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* * Accounting records are not subject to resource limits. */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); -- cgit v1.2.3 From 2d186afd04d669fe9c48b994c41a7405a3c9f16d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 7 Sep 2021 19:58:21 -0700 Subject: profiling: fix shift-out-of-bounds bugs Syzbot reported shift-out-of-bounds bug in profile_init(). The problem was in incorrect prof_shift. Since prof_shift value comes from userspace we need to clamp this value into [0, BITS_PER_LONG -1] boundaries. Second possible shiht-out-of-bounds was found by Tetsuo: sample_step local variable in read_profile() had "unsigned int" type, but prof_shift allows to make a BITS_PER_LONG shift. So, to prevent possible shiht-out-of-bounds sample_step type was changed to "unsigned long". Also, "unsigned short int" will be sufficient for storing [0, BITS_PER_LONG] value, that's why there is no need for "unsigned long" prof_shift. Link: https://lkml.kernel.org/r/20210813140022.5011-1-paskripkin@gmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+e68c89a9510c159d9684@syzkaller.appspotmail.com Suggested-by: Tetsuo Handa Signed-off-by: Pavel Skripkin Cc: Thomas Gleixner Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index c2ebddb5e974..eb9c7f0f5ac5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -41,7 +41,8 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; +static unsigned long prof_len; +static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); @@ -67,8 +68,8 @@ int profile_setup(char *str) if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel sleep profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel sleep profiling enabled (shift: %u)\n", prof_shift); #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); @@ -78,21 +79,21 @@ int profile_setup(char *str) if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel schedule profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel schedule profiling enabled (shift: %u)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; if (str[strlen(kvmstr)] == ',') str += strlen(kvmstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel KVM profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel KVM profiling enabled (shift: %u)\n", prof_shift); } else if (get_option(&str, &par)) { - prof_shift = par; + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; - pr_info("kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } return 1; @@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long p = *ppos; ssize_t read; char *pnt; - unsigned int sample_step = 1 << prof_shift; + unsigned long sample_step = 1UL << prof_shift; profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) -- cgit v1.2.3 From 1e1c15839df084f4011825fee922aa976c9159dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Sep 2021 20:00:00 -0700 Subject: fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. [rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n] [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none [akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter] Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Anton Blanchard Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index c82399c1618a..e2cf8c22b539 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) return NULL; } +static int user_epoll_alloc(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); +#else + return 0; +#endif +} + +static void user_epoll_free(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + percpu_counter_destroy(&up->epoll_watches); +#endif +} + /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + user_epoll_free(up); kmem_cache_free(uid_cachep, up); } @@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; refcount_set(&new->__count, 1); + if (user_epoll_alloc(new)) { + kmem_cache_free(uid_cachep, new); + return NULL; + } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); @@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -216,6 +238,9 @@ static int __init uid_cache_init(void) for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); + if (user_epoll_alloc(&root_user)) + panic("root_user epoll percpu counter alloc failed"); + /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); -- cgit v1.2.3 From 05da8113c9ba63a8913e6c73dc06ed01cae55f68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 20:00:35 -0700 Subject: kernel/fork.c: unexport get_{mm,task}_exe_file Only used by core code and the tomoyo which can't be a module either. Link: https://lkml.kernel.org/r/20210820095430.445242-1-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44f4c2d83763..df7296474ecd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,7 +1187,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm) rcu_read_unlock(); return exe_file; } -EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_exe_file - acquire a reference to the task's executable file @@ -1210,7 +1209,6 @@ struct file *get_task_exe_file(struct task_struct *task) task_unlock(task); return exe_file; } -EXPORT_SYMBOL(get_task_exe_file); /** * get_task_mm - acquire a reference to the task's mm -- cgit v1.2.3 From e1fbbd073137a9d63279f6bf363151a938347640 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 7 Sep 2021 20:00:41 -0700 Subject: prctl: allow to setup brk for et_dyn executables Keno Fischer reported that when a binray loaded via ld-linux-x the prctl(PR_SET_MM_MAP) doesn't allow to setup brk value because it lays before mm:end_data. For example a test program shows | # ~/t | | start_code 401000 | end_code 401a15 | start_stack 7ffce4577dd0 | start_data 403e10 | end_data 40408c | start_brk b5b000 | sbrk(0) b5b000 and when executed via ld-linux | # /lib64/ld-linux-x86-64.so.2 ~/t | | start_code 7fc25b0a4000 | end_code 7fc25b0c4524 | start_stack 7fffcc6b2400 | start_data 7fc25b0ce4c0 | end_data 7fc25b0cff98 | start_brk 55555710c000 | sbrk(0) 55555710c000 This of course prevent criu from restoring such programs. Looking into how kernel operates with brk/start_brk inside brk() syscall I don't see any problem if we allow to setup brk/start_brk without checking for end_data. Even if someone pass some weird address here on a purpose then the worst possible result will be an unexpected unmapping of existing vma (own vma, since prctl works with the callers memory) but test for RLIMIT_DATA is still valid and a user won't be able to gain more memory in case of expanding VMAs via new values shipped with prctl call. Link: https://lkml.kernel.org/r/20210121221207.GB2174@grain Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Signed-off-by: Cyrill Gorcunov Reported-by: Keno Fischer Acked-by: Andrey Vagin Tested-by: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Pavel Tikhomirov Cc: Alexander Mikhalitsyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..6ec50924b517 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1959,13 +1959,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) error = -EINVAL; - /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - /* * Neither we should allow to override limits if they set. */ -- cgit v1.2.3 From 4b6b08f2e45edda4c067ac40833e3c1f84383c0b Mon Sep 17 00:00:00 2001 From: "Qiang.Zhang" Date: Tue, 31 Aug 2021 10:29:19 +0800 Subject: tracing/osnoise: Fix missed cpus_read_unlock() in start_per_cpu_kthreads() When start_kthread() return error, the cpus_read_unlock() need to be called. Link: https://lkml.kernel.org/r/20210831022919.27630-1-qiang.zhang@windriver.com Cc: Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Acked-by: Daniel Bristot de Oliveira Signed-off-by: Qiang.Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 65b08b8e5bf8..ce053619f289 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1548,7 +1548,7 @@ static int start_kthread(unsigned int cpu) static int start_per_cpu_kthreads(struct trace_array *tr) { struct cpumask *current_mask = &save_cpumask; - int retval; + int retval = 0; int cpu; cpus_read_lock(); @@ -1568,13 +1568,13 @@ static int start_per_cpu_kthreads(struct trace_array *tr) retval = start_kthread(cpu); if (retval) { stop_per_cpu_kthreads(); - return retval; + break; } } cpus_read_unlock(); - return 0; + return retval; } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 0be083cee42ec78ba6f9148f5b12a00aa2fb8bd3 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 1 Sep 2021 16:55:13 +0300 Subject: tracing: synth events: increase max fields count Sometimes it is useful to construct larger synthetic trace events. Increase 'SYNTH_FIELDS_MAX' (maximum number of fields in a synthetic event) from 32 to 64. Link: https://lkml.kernel.org/r/20210901135513.3087062-1-dedekind1@gmail.com Signed-off-by: Artem Bityutskiy Acked-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_synth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index 4007fe95cf42..b29595fe3ac5 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -5,7 +5,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 +#define SYNTH_FIELDS_MAX 64 #define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ -- cgit v1.2.3 From c910db943d35d4ac4b77570ece76e0799af24233 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 2 Sep 2021 15:57:12 -0500 Subject: tracing: Dynamically allocate the per-elt hist_elt_data array Setting the hist_elt_data.field_var_str[] array unconditionally to a size of SYNTH_FIELD_MAX elements wastes space unnecessarily. The actual number of elements needed can be calculated at run-time instead. In most cases, this will save a lot of space since it's a per-elt array which isn't normally close to being full. It also allows us to increase SYNTH_FIELD_MAX without worrying about even more wastage when we do that. Link: https://lkml.kernel.org/r/d52ae0ad5e1b59af7c4f54faf3fc098461fd82b3.camel@kernel.org Signed-off-by: Tom Zanussi Tested-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9d91b1c06957..a6061a69aa84 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -508,7 +508,8 @@ struct track_data { struct hist_elt_data { char *comm; u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; + char **field_var_str; + int n_field_var_str; }; struct snapshot_context { @@ -1401,9 +1402,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data) { unsigned int i; - for (i = 0; i < SYNTH_FIELDS_MAX; i++) + for (i = 0; i < elt_data->n_field_var_str; i++) kfree(elt_data->field_var_str[i]); + kfree(elt_data->field_var_str); + kfree(elt_data->comm); kfree(elt_data); } @@ -1451,6 +1454,13 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) size = STR_VAR_LEN_MAX; + elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL); + if (!elt_data->field_var_str) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + elt_data->n_field_var_str = n_str; + for (i = 0; i < n_str; i++) { elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); if (!elt_data->field_var_str[i]) { -- cgit v1.2.3 From cfd799837dbc48499abb05d1891b3d9992354d3a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 04:38:03 +0900 Subject: tracing/boot: Fix to loop on only subkeys Since the commit e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key") allows to co-exist a value node and key nodes under a node, xbc_node_for_each_child() is not only returning key node but also a value node. In the boot-time tracing using xbc_node_for_each_child() to iterate the events, groups and instances, but those must be key nodes. Thus it must use xbc_node_for_each_subkey(). Link: https://lkml.kernel.org/r/163112988361.74896.2267026262061819145.stgit@devnote2 Fixes: e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1060b0446032..388e65d05978 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -522,14 +522,14 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ - xbc_node_for_each_child(node, gnode) { + xbc_node_for_each_subkey(node, gnode) { data = xbc_node_get_data(gnode); if (!strcmp(data, "enable")) { enable_all = true; continue; } enable = false; - xbc_node_for_each_child(gnode, enode) { + xbc_node_for_each_subkey(gnode, enode) { data = xbc_node_get_data(enode); if (!strcmp(data, "enable")) { enable = true; @@ -625,7 +625,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!node) return; - xbc_node_for_each_child(node, inode) { + xbc_node_for_each_subkey(node, inode) { p = xbc_node_get_data(inode); if (!p || *p == '\0') continue; -- cgit v1.2.3 From 4b692e861619353ce069e547a67c8d0e32d9ef3d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:10 -0700 Subject: kexec: move locking into do_kexec_load Patch series "compat: remove compat_alloc_user_space", v5. Going through compat_alloc_user_space() to convert indirect system call arguments tends to add complexity compared to handling the native and compat logic in the same code. This patch (of 6): The locking is the same between the native and compat version of sys_kexec_load(), so it can be done in the common implementation to reduce duplication. Link: https://lkml.kernel.org/r/20210727144859.4150043-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20210727144859.4150043-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Al Viro Cc: Feng Tang Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c82c6c06f051..9c7aef8f4bb6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -110,6 +110,17 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long i; int ret; + /* + * Because we write directly to the reserved memory region when loading + * crash kernels we need a mutex here to prevent multiple crash kernels + * from attempting to load simultaneously, and to prevent a crash kernel + * from loading over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) @@ -121,7 +132,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (nr_segments == 0) { /* Uninstall image */ kimage_free(xchg(dest_image, NULL)); - return 0; + ret = 0; + goto out_unlock; } if (flags & KEXEC_ON_CRASH) { /* @@ -134,7 +146,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); if (ret) - return ret; + goto out_unlock; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -171,6 +183,8 @@ out: arch_kexec_protect_crashkres(); kimage_free(image); +out_unlock: + mutex_unlock(&kexec_mutex); return ret; } @@ -247,21 +261,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, segments, flags); - mutex_unlock(&kexec_mutex); - return result; } @@ -301,21 +302,8 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, ksegments, flags); - mutex_unlock(&kexec_mutex); - return result; } #endif -- cgit v1.2.3 From 5d700a0fd71ded7096b97f01d276efc1a6579613 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:13 -0700 Subject: kexec: avoid compat_alloc_user_space kimage_alloc_init() expects a __user pointer, so compat_sys_kexec_load() uses compat_alloc_user_space() to convert the layout and put it back onto the user space caller stack. Moving the user space access into the syscall handler directly actually makes the code simpler, as the conversion for compat mode can now be done on kernel memory. Link: https://lkml.kernel.org/r/20210727144859.4150043-3-arnd@kernel.org Link: https://lore.kernel.org/lkml/YPbtsU4GX6PL7%2F42@infradead.org/ Link: https://lore.kernel.org/lkml/m1y2cbzmnw.fsf@fess.ebiederm.org/ Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 61 ++++++++++++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c7aef8f4bb6..b5e40f069768 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -19,26 +19,9 @@ #include "kexec_internal.h" -static int copy_user_segment_list(struct kimage *image, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int ret; - size_t segment_bytes; - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof(*segments); - ret = copy_from_user(image->segment, segments, segment_bytes); - if (ret) - ret = -EFAULT; - - return ret; -} - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, + struct kexec_segment *segments, unsigned long flags) { int ret; @@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, return -ENOMEM; image->start = entry; - - ret = copy_user_segment_list(image, nr_segments, segments); - if (ret) - goto out_free_image; + image->nr_segments = nr_segments; + memcpy(image->segment, segments, nr_segments * sizeof(*segments)); if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ @@ -104,7 +85,7 @@ out_free_image: } static int do_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, unsigned long flags) + struct kexec_segment *segments, unsigned long flags) { struct kimage **dest_image, *image; unsigned long i; @@ -250,7 +231,8 @@ static inline int kexec_load_check(unsigned long nr_segments, SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - int result; + struct kexec_segment *ksegments; + unsigned long result; result = kexec_load_check(nr_segments, flags); if (result) @@ -261,7 +243,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - result = do_kexec_load(entry, nr_segments, segments, flags); + ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0])); + if (IS_ERR(ksegments)) + return PTR_ERR(ksegments); + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + kfree(ksegments); return result; } @@ -273,7 +260,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, flags) { struct compat_kexec_segment in; - struct kexec_segment out, __user *ksegments; + struct kexec_segment *ksegments; unsigned long i, result; result = kexec_load_check(nr_segments, flags); @@ -286,24 +273,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]), + GFP_KERNEL); + if (!ksegments) + return -ENOMEM; + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) - return -EFAULT; + goto fail; - out.buf = compat_ptr(in.buf); - out.bufsz = in.bufsz; - out.mem = in.mem; - out.memsz = in.memsz; - - result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) - return -EFAULT; + ksegments[i].buf = compat_ptr(in.buf); + ksegments[i].bufsz = in.bufsz; + ksegments[i].mem = in.mem; + ksegments[i].memsz = in.memsz; } result = do_kexec_load(entry, nr_segments, ksegments, flags); +fail: + kfree(ksegments); return result; } #endif -- cgit v1.2.3 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 64578adfe115..f43d89d92860 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,15 +292,10 @@ COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); -COND_SYSCALL_COMPAT(mbind); COND_SYSCALL(get_mempolicy); -COND_SYSCALL_COMPAT(get_mempolicy); COND_SYSCALL(set_mempolicy); -COND_SYSCALL_COMPAT(set_mempolicy); COND_SYSCALL(migrate_pages); -COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); -COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit v1.2.3 From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 05adfd6fa8bf..55551989d9da 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset); - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); -- cgit v1.2.3 From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Wed, 8 Sep 2021 18:10:05 -0700 Subject: mm/hugetlb: initialize hugetlb_usage in mm_init After fork, the child process will get incorrect (2x) hugetlb_usage. If a process uses 5 2MB hugetlb pages in an anonymous mapping, HugetlbPages: 10240 kB and then forks, the child will show, HugetlbPages: 20480 kB The reason for double the amount is because hugetlb_usage will be copied from the parent and then increased when we copy page tables from parent to child. Child will have 2x actual usage. Fix this by adding hugetlb_count_init in mm_init. Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status") Signed-off-by: Liu Zixian Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ff5be23800af..38681ad44c76 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1063,6 +1063,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); + hugetlb_count_init(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3 From e5480572706da1b2c2dc2c6484eab64f92b9263b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Sep 2021 11:44:11 +0200 Subject: locking/rtmutex: Fix ww_mutex deadlock check Dan reported that rt_mutex_adjust_prio_chain() can be called with .orig_waiter == NULL however commit a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") unconditionally dereferences it. Since both call-sites that have .orig_waiter == NULL don't care for the return value, simply disable the deadlock squash by adding the NULL check. Notably, both callers use the deadlock condition as a termination condition for the iteration; once detected, it is sure that (de)boosting is done. Arguably step [3] would be a more natural termination point, but it's dubious whether adding a third deadlock detection state would improve the code. Fixes: a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/YS9La56fHMiCCo75@hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8eabdc79602b..6bb116c559b4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * other configuration and we fail to report; also, see * lockdep. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) ret = 0; raw_spin_unlock(&lock->wait_lock); -- cgit v1.2.3 From 9848417926353daa59d2b05eb26e185063dbac6e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 6 Sep 2021 13:30:34 +0200 Subject: sched/idle: Make the idle timer expire in hard interrupt context The intel powerclamp driver will setup a per-CPU worker with RT priority. The worker will then invoke play_idle() in which it remains in the idle poll loop until it is stopped by the timer it started earlier. That timer needs to expire in hard interrupt context on PREEMPT_RT. Otherwise the timer will expire in ksoftirqd as a SOFT timer but that task won't be scheduled on the CPU because its priority is lower than the priority of the worker which is in the idle loop. Always expire the idle timer in hard interrupt context. Reported-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210906113034.jgfxrjdvxnjqgtmc@linutronix.de --- kernel/sched/idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 912b47aa99d8..d17b0a5ce6ac 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); it.timer.function = idle_inject_timer_fn; hrtimer_start(&it.timer, ns_to_ktime(duration_ns), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); while (!READ_ONCE(it.done)) do_idle(); -- cgit v1.2.3 From 868ad33bfa3bf39960982682ad3a0f8ebda1656e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 28 Aug 2021 15:55:52 +0200 Subject: sched: Prevent balance_push() on remote runqueues sched_setscheduler() and rt_mutex_setprio() invoke the run-queue balance callback after changing priorities or the scheduling class of a task. The run-queue for which the callback is invoked can be local or remote. That's not a problem for the regular rq::push_work which is serialized with a busy flag in the run-queue struct, but for the balance_push() work which is only valid to be invoked on the outgoing CPU that's wrong. It not only triggers the debug warning, but also leaves the per CPU variable push_work unprotected, which can result in double enqueues on the stop machine list. Remove the warning and validate that the function is invoked on the outgoing CPU. Fixes: ae7927023243 ("sched: Optimize finish_lock_switch()") Reported-by: Sebastian Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87zgt1hdw7.ffs@tglx --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3b27c6c5153..b21a1857b75a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8523,7 +8523,6 @@ static void balance_push(struct rq *rq) struct task_struct *push_task = rq->curr; lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* * Ensure the thing is persistent until balance_push_set(.on = false); @@ -8531,9 +8530,10 @@ static void balance_push(struct rq *rq) rq->balance_callback = &balance_push_callback; /* - * Only active while going offline. + * Only active while going offline and when invoked on the outgoing + * CPU. */ - if (!cpu_dying(rq->cpu)) + if (!cpu_dying(rq->cpu) || rq != this_rq()) return; /* -- cgit v1.2.3 From a3928f877e7b153dbc243d6329b3777ac75b6004 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:23 +0900 Subject: tracing/boot: Fix trace_boot_hist_add_array() to check array is value trace_boot_hist_add_array() uses the combination of xbc_node_find_child() and xbc_node_get_child() to get the child node of the key node. But since it missed to check the child node is data node or not, user can pass the subkey node for the array node (anode). To avoid this issue, check the array node is a data node. Actually, there is xbc_node_find_value(node, key, vnode), which ensures the @vnode is a value node, so use it in trace_boot_hist_add_array() to fix this issue. Link: https://lkml.kernel.org/r/163119458308.161018.1516455973625940212.stgit@devnote2 Fixes: e66ed86ca6c5 ("tracing/boot: Add per-event histogram action options") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 388e65d05978..a6be48b24774 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -219,13 +219,12 @@ static int __init trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, char *end, const char *key) { - struct xbc_node *knode, *anode; + struct xbc_node *anode; const char *p; char sep; - knode = xbc_node_find_child(hnode, key); - if (knode) { - anode = xbc_node_get_child(knode); + p = xbc_node_find_value(hnode, key, &anode); + if (p) { if (!anode) { pr_err("hist.%s requires value(s).\n", key); return -EINVAL; -- cgit v1.2.3 From 5f8895b27da2656e5e2be029acfb68c016f03326 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:30 +0900 Subject: tracing/boot: Fix to check the histogram control param is a leaf node Since xbc_node_find_child() doesn't ensure the returned node is a leaf node (key-value pair or do not have subkeys), use xbc_node_find_value to ensure the histogram control parameter is a leaf node in trace_boot_compose_hist_cmd(). Link: https://lkml.kernel.org/r/163119459059.161018.18341288218424528962.stgit@devnote2 Fixes: e66ed86ca6c5 ("tracing/boot: Add per-event histogram action options") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index a6be48b24774..db6ee372dc6d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -385,11 +385,11 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) } /* Histogram control attributes (mutual exclusive) */ - if (xbc_node_find_child(hnode, "pause")) + if (xbc_node_find_value(hnode, "pause", NULL)) append_printf(&buf, end, ":pause"); - else if (xbc_node_find_child(hnode, "continue")) + else if (xbc_node_find_value(hnode, "continue", NULL)) append_printf(&buf, end, ":continue"); - else if (xbc_node_find_child(hnode, "clear")) + else if (xbc_node_find_value(hnode, "clear", NULL)) append_printf(&buf, end, ":clear"); /* Histogram handler and actions */ -- cgit v1.2.3 From 5dfe50b05588010f347cb2f436434bf22b7a84ed Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 22:36:38 +0900 Subject: bootconfig: Rename xbc_node_find_child() to xbc_node_find_subkey() Rename xbc_node_find_child() to xbc_node_find_subkey() for clarifying that function returns a key node (no value node). Since there are xbc_node_for_each_child() (loop on all child nodes) and xbc_node_for_each_subkey() (loop on only subkey nodes), this name distinction is necessary to avoid confusing users. Link: https://lkml.kernel.org/r/163119459826.161018.11200274779483115300.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index db6ee372dc6d..8d252f63cd78 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -262,9 +262,9 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ - knode = xbc_node_find_child(hnode, "trace"); + knode = xbc_node_find_subkey(hnode, "trace"); if (!knode) - knode = xbc_node_find_child(hnode, "save"); + knode = xbc_node_find_subkey(hnode, "save"); if (knode) { anode = xbc_node_get_child(knode); @@ -283,7 +283,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, sep = ','; } append_printf(bufp, end, ")"); - } else if (xbc_node_find_child(hnode, "snapshot")) { + } else if (xbc_node_find_subkey(hnode, "snapshot")) { append_printf(bufp, end, ".snapshot()"); } else { pr_err("hist.%s requires an action.\n", @@ -314,7 +314,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, break; } - if (xbc_node_find_child(hnode, param)) + if (xbc_node_find_subkey(hnode, param)) ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); return ret; @@ -374,7 +374,7 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) if (p) append_printf(&buf, end, ":name=%s", p); - node = xbc_node_find_child(hnode, "var"); + node = xbc_node_find_subkey(hnode, "var"); if (node) { xbc_node_for_each_key_value(node, knode, p) { /* Expression must not include spaces. */ @@ -393,13 +393,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) append_printf(&buf, end, ":clear"); /* Histogram handler and actions */ - node = xbc_node_find_child(hnode, "onmax"); + node = xbc_node_find_subkey(hnode, "onmax"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onchange"); + node = xbc_node_find_subkey(hnode, "onchange"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onmatch"); + node = xbc_node_find_subkey(hnode, "onmatch"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; @@ -436,7 +436,7 @@ trace_boot_init_histograms(struct trace_event_file *file, } } - if (xbc_node_find_child(hnode, "keys")) { + if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) @@ -495,7 +495,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); } - anode = xbc_node_find_child(enode, "hist"); + anode = xbc_node_find_subkey(enode, "hist"); if (anode) trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) @@ -517,7 +517,7 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) bool enable, enable_all = false; const char *data; - node = xbc_node_find_child(node, "event"); + node = xbc_node_find_subkey(node, "event"); if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ @@ -620,7 +620,7 @@ trace_boot_init_instances(struct xbc_node *node) struct trace_array *tr; const char *p; - node = xbc_node_find_child(node, "instance"); + node = xbc_node_find_subkey(node, "instance"); if (!node) return; -- cgit v1.2.3 From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Sep 2021 08:49:59 -0700 Subject: bpf, mm: Fix lockdep warning triggered by stack_map_get_build_id_offset() Currently the bpf selftest "get_stack_raw_tp" triggered the warning: [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260 To reproduce, first build `test_progs` binary: make -C tools/testing/selftests/bpf -j60 and then run the binary at tools/testing/selftests/bpf directory: ./test_progs -t get_stack_raw_tp The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") which added mmap_assert_locked() in find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid() helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case, the above warning is emitted during test run. This patch fixed the issue by (1). using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and (2). droping lockdep for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since after this patch nobody calls it any more. Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Liam R. Howlett Cc: Luigi Rizzo Cc: Jason Gunthorpe Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com --- kernel/bpf/stackmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..09a3fd97d329 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } -- cgit v1.2.3 From 0e6491b559704da720f6da09dd0a52c4df44c514 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 11 Sep 2021 08:55:57 +0800 Subject: bpf: Add oversize check before call kvcalloc() Commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") add the oversize check. When the allocation is larger than what kmalloc() supports, the following warning triggered: WARNING: CPU: 0 PID: 8408 at mm/util.c:597 kvmalloc_node+0x108/0x110 mm/util.c:597 Modules linked in: CPU: 0 PID: 8408 Comm: syz-executor221 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kvmalloc_node+0x108/0x110 mm/util.c:597 Call Trace: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] check_btf_line kernel/bpf/verifier.c:9925 [inline] check_btf_info kernel/bpf/verifier.c:10049 [inline] bpf_check+0xd634/0x150d0 kernel/bpf/verifier.c:13759 bpf_prog_load kernel/bpf/syscall.c:2301 [inline] __sys_bpf+0x11181/0x126e0 kernel/bpf/syscall.c:4587 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x78/0x90 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+f3e749d4c662818ae439@syzkaller.appspotmail.com Signed-off-by: Bixuan Cui Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210911005557.45518-1-cuibixuan@huawei.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 047ac4b4703b..e76b55917905 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || -- cgit v1.2.3 From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Sep 2021 01:07:57 +0200 Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net --- kernel/cgroup/cgroup.c | 50 ++++++++++---------------------------------------- 1 file changed, 10 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..8afa8690d288 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } -- cgit v1.2.3 From 77e02cf57b6cff9919949defb7fd9b8ac16399a2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Sep 2021 13:23:22 -0700 Subject: memblock: introduce saner 'memblock_free_ptr()' interface The boot-time allocation interface for memblock is a mess, with 'memblock_alloc()' returning a virtual pointer, but then you are supposed to free it with 'memblock_free()' that takes a _physical_ address. Not only is that all kinds of strange and illogical, but it actually causes bugs, when people then use it like a normal allocation function, and it fails spectacularly on a NULL pointer: https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/ or just random memory corruption if the debug checks don't catch it: https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/ I really don't want to apply patches that treat the symptoms, when the fundamental cause is this horribly confusing interface. I started out looking at just automating a sane replacement sequence, but because of this mix or virtual and physical addresses, and because people have used the "__pa()" macro that can take either a regular kernel pointer, or just the raw "unsigned long" address, it's all quite messy. So this just introduces a new saner interface for freeing a virtual address that was allocated using 'memblock_alloc()', and that was kept as a regular kernel pointer. And then it converts a couple of users that are obvious and easy to test, including the 'xbc_nodes' case in lib/bootconfig.c that caused problems. Reported-by: kernel test robot Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Steven Rostedt Cc: Mike Rapoport Cc: Andrew Morton Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Vlastimil Babka Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 825277e1e742..a8d0a58deebc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free(__pa(new_descs), new_descs_size); + memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: - memblock_free(__pa(new_log_buf), new_log_buf_len); + memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; -- cgit v1.2.3